#!/usr/bin/python
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
#

import os
import errno
import time
import hashlib
import urllib

import pkg.fmri as fmri
import pkg.search_errors as search_errors
import pkg.portable as portable
from pkg.misc import PKG_FILE_BUFSIZ

FAST_ADD = 'fast_add.v1'
FAST_REMOVE = 'fast_remove.v1'
MANIFEST_LIST = 'manf_list.v1'
FULL_FMRI_FILE = 'full_fmri_list'
MAIN_FILE = 'main_dict.ascii.v2'
BYTE_OFFSET_FILE = 'token_byte_offset.v1'
FULL_FMRI_HASH_FILE = 'full_fmri_list.hash'
FMRI_OFFSETS_FILE = 'fmri_offsets.v1'


def consistent_open(data_list, directory, timeout=1):
    """Opens all data holders in data_list and ensures that the
    versions are consistent among all of them.
    It retries several times in case a race condition between file
    migration and open is encountered.
    Note: Do not set timeout to 0; that causes an exception to be
    raised immediately.
    """

    missing = None
    cur_version = None

    start_time = time.time()

    while cur_version is None and missing is not True:
        # The assignments to cur_version and missing cannot be
        # placed here. They must be reset prior to breaking out of
        # the for loop so that the while loop condition will be true.
        # They cannot be placed after the for loop since that path is
        # taken when all files are missing or opened successfully.
        if timeout is not None and ((time.time() - start_time) > timeout):
            raise search_errors.InconsistentIndexException(
                directory)
        for d in data_list:
            # All indexes must have the same version and all must
            # either be present or absent for a successful return.
            # If one of these conditions is not met, the function
            # tries again until it succeeds or the time spent in
            # the function is greater than timeout.
            try:
                f = os.path.join(directory, d.get_file_name())
                fh = open(f, 'rb')
                # If we get here, then the current index file
                # is present.
                if missing is None:
                    missing = False
                elif missing:
                    for dl in data_list:
                        dl.close_file_handle()
                    missing = None
                    cur_version = None
                    break
                d.set_file_handle(fh, f)
                version_tmp = fh.readline()
                version_num = \
                    int(version_tmp.split(' ')[1].rstrip('\n'))
                # Read the version. If this is the first file,
                # set the expected version; otherwise check that
                # the version matches the expected version.
                if cur_version is None:
                    cur_version = version_num
                elif cur_version != version_num:
                    # Got inconsistent versions, so close
                    # all files and try again.
                    for d in data_list:
                        d.close_file_handle()
                    missing = None
                    cur_version = None
                    break
            except IOError as e:
                if e.errno == errno.ENOENT:
                    # If the index file is missing, ensure
                    # that previous files were missing as
                    # well. If not, try again.
                    if missing is False:
                        for d in data_list:
                            d.close_file_handle()
                        missing = None
                        cur_version = None
                        break
                    missing = True
                else:
                    for d in data_list:
                        d.close_file_handle()
                    raise
    if missing:
        assert cur_version is None
        # The index is missing (i.e., no files were present).
        return None
    else:
        assert cur_version is not None
        return cur_version
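
# Illustrative usage sketch (not part of the original module): opening
# two stores together and handling a missing index. The file names and
# directory below are hypothetical.
#
#   stores = [IndexStoreDict("one.v1"), IndexStoreSet("two.v1")]
#   version = consistent_open(stores, "/some/index/dir", timeout=1)
#   if version is None:
#       pass  # No index files were present at all.
#   else:
#       pass  # All files opened and share the on-disk version.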


class IndexStoreBase(object):
    """Base class for all data storage used by the indexer and
    queryEngine. All members must have a file name and maintain
    an internal file handle to that file as instructed by external
    calls.
    """

    def __init__(self, file_name):
        self._name = file_name
        self._file_handle = None
        self._file_path = None
        self._size = None
        self._mtime = None
        self._inode = None
        self._have_read = False

    def get_file_name(self):
        return self._name

    def set_file_handle(self, f_handle, f_path):
        if self._file_handle:
            raise RuntimeError("setting an extant file handle; "
                "must close first, fp is: " + f_path)
        else:
            self._file_handle = f_handle
            self._file_path = f_path
            if self._mtime is None:
                stat_info = os.stat(self._file_path)
                self._mtime = stat_info.st_mtime
                self._size = stat_info.st_size
                self._inode = stat_info.st_ino

    def get_file_path(self):
        return self._file_path

    def __copy__(self):
        return self.__class__(self._name)

    def close_file_handle(self):
        """Closes the file handle and clears it so that it cannot
        be reused.
        """

        if self._file_handle:
            self._file_handle.close()
            self._file_handle = None

    def _protected_write_dict_file(self, path, version_num, iterable):
        """Writes the dictionary in the expected format.
        Note: Only child classes should call this method.
        """
        version_string = "VERSION: "
        file_handle = open(os.path.join(path, self._name), 'wb')
        file_handle.write(version_string + str(version_num) + "\n")
        for name in iterable:
            file_handle.write(str(name) + "\n")
        file_handle.close()

    def should_reread(self):
        """This method uses the inode, modification time, and file
        size to (heuristically) determine whether the file backing
        this storage has changed since it was last read.
        """
        stat_info = os.stat(self._file_path)
        if self._inode != stat_info.st_ino or \
            self._mtime != stat_info.st_mtime or \
            self._size != stat_info.st_size:
            return True
        return not self._have_read

    def read_dict_file(self):
        self._have_read = True

    def open(self, directory):
        """This uses consistent_open to ensure that the version line
        processing is done consistently and that only a single
        function actually opens files stored using this class.
        """
        return consistent_open([self], directory)


class IndexStoreMainDict(IndexStoreBase):
    """Class for representing the main dictionary file."""

    # Here is an example of a line from the main dictionary; it is
    # explained below:
    # %25gconf.xml file!basename@basename#579,13249,13692,77391,77628
    #
    # Each line begins with a urllib-quoted search token. It is
    # followed by a set of space-separated lists. Each of these lists
    # begins with an action type, which is separated from its sublist
    # by a '!'. Next is the key type, which is separated from its
    # sublist by a '@'. Next is the full value, which is used in set
    # actions to hold the full value which matched the token; it is
    # separated from its sublist by a '#'. The next token (579) is the
    # fmri id. The subsequent comma-separated values are the byte
    # offsets into that manifest of the lines containing that token.

    sep_chars = [" ", "!", "@", "#", ","]
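
    # Illustrative decomposition of the sample line above (not part of
    # the original source comments), reading the separators outward-in:
    #
    #   token: "%25gconf.xml"  -> urllib.unquote -> "%gconf.xml"
    #   ' ' :  one list follows: "file!basename@basename#579,13249,..."
    #   '!' :  action type "file"
    #   '@' :  key type "basename"
    #   '#' :  full value "basename"
    #   ',' :  fmri id 579, then offsets 13249, 13692, 77391, 77628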

    def __init__(self, file_name):
        IndexStoreBase.__init__(self, file_name)
        self._old_suffix = None

    def write_dict_file(self, path, version_num):
        """This class relies on external methods to write the file.
        Making this empty call to _protected_write_dict_file allows
        the file to be set up correctly with the version number stored
        correctly.
        """
        IndexStoreBase._protected_write_dict_file(self, path,
            version_num, [])

    def get_file_handle(self):
        """Returns the file handle. Note that doing anything other
        than sequential reads or writes to or from this file_handle
        may result in unexpected behavior. In short, don't use seek.
        """
        return self._file_handle

    @staticmethod
    def parse_main_dict_line(line):
        """Parses one line of a main dictionary file.
        Changes to this function must be paired with changes to
        transform_main_dict_line below.

        This should produce the same data structure that
        _write_main_dict_line in indexer.py creates to write out each
        line.
        """

        split_chars = IndexStoreMainDict.sep_chars
        line = line.rstrip('\n')
        tmp = line.split(split_chars[0])
        tok = urllib.unquote(tmp[0])
        atl = tmp[1:]
        res = []
        for ati in atl:
            tmp = ati.split(split_chars[1])
            action_type = tmp[0]
            stl = tmp[1:]
            at_res = []
            for sti in stl:
                tmp = sti.split(split_chars[2])
                subtype = tmp[0]
                fvl = tmp[1:]
                st_res = []
                for fvi in fvl:
                    tmp = fvi.split(split_chars[3])
                    full_value = urllib.unquote(tmp[0])
                    pfl = tmp[1:]
                    fv_res = []
                    for pfi in pfl:
                        tmp = pfi.split(split_chars[4])
                        pfmri_index = int(tmp[0])
                        offsets = [int(t) for t in tmp[1:]]
                        fv_res.append((pfmri_index, offsets))
                    st_res.append((full_value, fv_res))
                at_res.append((subtype, st_res))
            res.append((action_type, at_res))
        return tok, res
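
    # Illustrative sketch (not part of the original module): parsing a
    # shortened version of the sample line from the class comment
    # yields:
    #
    #   tok, res = IndexStoreMainDict.parse_main_dict_line(
    #       "%25gconf.xml file!basename@basename#579,13249,13692\n")
    #   # tok == "%gconf.xml"
    #   # res == [("file", [("basename",
    #   #     [("basename", [(579, [13249, 13692])])])])]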

    @staticmethod
    def parse_main_dict_line_for_token(line):
        """Pulls the token out of a line from a main dictionary file.
        Changes to this function must be paired with changes to
        transform_main_dict_line below.
        """

        line = line.rstrip("\n")
        lst = line.split(" ", 1)
        return urllib.unquote(lst[0])

    @staticmethod
    def transform_main_dict_line(token, entries):
        """Paired with parse_main_dict_line above. Transforms a token
        and its data into the string which can be written to the main
        dictionary.

        The "token" parameter is the token whose index line is being
        generated.

        The "entries" parameter is a list of lists of lists and so on.
        It contains information about where and how "token" was seen
        in manifests. The depth of all lists at each level must be
        consistent, and must match the length of "sep_chars". The
        details of the contents of entries are described in
        _write_main_dict_line in indexer.py.
        """
        sep_chars = IndexStoreMainDict.sep_chars
        res = "{0}".format(urllib.quote(str(token)))
        for ati, atl in enumerate(entries):
            action_type, atl = atl
            res += "{0}{1}".format(sep_chars[0], action_type)
            for sti, stl in enumerate(atl):
                subtype, stl = stl
                res += "{0}{1}".format(sep_chars[1], subtype)
                for fvi, fvl in enumerate(stl):
                    full_value, fvl = fvl
                    res += "{0}{1}".format(sep_chars[2],
                        urllib.quote(str(full_value)))
                    for pfi, pfl in enumerate(fvl):
                        pfmri_index, pfl = pfl
                        res += "{0}{1}".format(sep_chars[3],
                            pfmri_index)
                        for offset in pfl:
                            res += "{0}{1}".format(
                                sep_chars[4], offset)
        return res + "\n"
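
    # Illustrative sketch (not part of the original module):
    # transform_main_dict_line and parse_main_dict_line are inverses,
    # so an entry should survive a round trip:
    #
    #   entry = ("%gconf.xml", [("file", [("basename",
    #       [("basename", [(579, [13249, 13692])])])])])
    #   line = IndexStoreMainDict.transform_main_dict_line(*entry)
    #   assert IndexStoreMainDict.parse_main_dict_line(line) == entry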

    def count_entries_removed_during_partial_indexing(self):
        """Returns the number of entries removed during a second phase
        of indexing.
        """
        # This returns 0 because this class is not responsible for
        # storing anything in memory.
        return 0

    def shift_file(self, use_dir, suffix):
        """Moves the existing file with self._name in directory
        use_dir to a new file named self._name + suffix in directory
        use_dir. If it has done this previously, it removes the old
        file it moved. It also opens the newly moved file and uses
        that as the file for its file handle.
        """
        assert self._file_handle is None
        orig_path = os.path.join(use_dir, self._name)
        new_path = os.path.join(use_dir, self._name + suffix)
        portable.rename(orig_path, new_path)
        tmp_name = self._name
        self._name = self._name + suffix
        self.open(use_dir)
        self._name = tmp_name
        if self._old_suffix is not None:
            os.remove(os.path.join(use_dir, self._old_suffix))
        self._old_suffix = self._name + suffix


class IndexStoreListDict(IndexStoreBase):
    """Used when both a list and a dictionary are needed to
    store the information. Used for bidirectional lookup when
    one item is an int (an id) and the other is not (an entity). It
    maintains a list of empty spots in the list so that adding
    entities can take advantage of unused space. It encodes empty
    space as a blank line in the file format and '' in the internal
    list.
    """

    def __init__(self, file_name, build_function=lambda x: x,
        decode_function=lambda x: x):
        IndexStoreBase.__init__(self, file_name)
        self._list = []
        self._dict = {}
        self._next_id = 0
        self._list_of_empties = []
        self._decode_func = decode_function
        self._build_func = build_function
        self._line_cnt = 0

    def add_entity(self, entity, is_empty):
        """Adds an entity consistently to the list and dictionary
        allowing bidirectional lookup.
        """
        assert len(self._list) == self._next_id
        if self._list_of_empties and not is_empty:
            use_id = self._list_of_empties.pop(0)
            assert use_id <= len(self._list)
            if use_id == len(self._list):
                self._list.append(entity)
                self._next_id += 1
            else:
                self._list[use_id] = entity
        else:
            use_id = self._next_id
            self._list.append(entity)
            self._next_id += 1
        if not is_empty:
            self._dict[entity] = use_id
        assert len(self._list) == self._next_id
        return use_id
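
    # Illustrative sketch (not part of the original module): the ids
    # handed out by add_entity support lookup in both directions. The
    # file name is hypothetical.
    #
    #   store = IndexStoreListDict("ids.v1")
    #   i = store.add_entity("pkg://a/b", False)
    #   assert store.get_entity(i) == "pkg://a/b"
    #   assert store.get_id("pkg://a/b") == i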

    def remove_id(self, in_id):
        """Deletes in_id from the list and the dictionary."""
        entity = self._list[in_id]
        self._list[in_id] = ""
        self._dict[entity] = ""

    def remove_entity(self, entity):
        """Deletes the entity from the list and the dictionary."""
        in_id = self._dict[entity]
        self._dict[entity] = ""
        self._list[in_id] = ""

    def get_id(self, entity):
        """Returns the id of entity."""
        return self._dict[entity]

    def get_id_and_add(self, entity):
        """Adds entity if it's not previously stored and returns the
        id for entity.
        """
        # This code purposefully reimplements add_entity code.
        # Replacing the function calls to has_entity, add_entity,
        # and get_id with direct access to the data structures gave a
        # speedup of a factor of 4. Because this is a very hot path,
        # the tradeoff seemed appropriate.

        if entity not in self._dict:
            assert len(self._list) == self._next_id
            if self._list_of_empties:
                use_id = self._list_of_empties.pop(0)
                assert use_id <= len(self._list)
                if use_id == len(self._list):
                    self._list.append(entity)
                    self._next_id += 1
                else:
                    self._list[use_id] = entity
            else:
                use_id = self._next_id
                self._list.append(entity)
                self._next_id += 1
            self._dict[entity] = use_id
        assert len(self._list) == self._next_id
        return self._dict[entity]

    def get_entity(self, in_id):
        """Returns the entity in_id maps to."""
        return self._list[in_id]

    def has_entity(self, entity):
        """Checks if entity is in storage."""
        return entity in self._dict

    def has_empty(self):
        """Checks if the structure has any empty elements which
        can be filled with data.
        """
        return len(self._list_of_empties) > 0

    def get_next_empty(self):
        """Returns the next id which maps to no element."""
        return self._list_of_empties.pop()

    def write_dict_file(self, path, version_num):
        """Passes self._list to the parent class to write to a file.
        """
        IndexStoreBase._protected_write_dict_file(self, path,
            version_num, (self._decode_func(l) for l in self._list))

    def read_dict_file(self):
        """Reads in a dictionary previously stored using the above
        call.
        """
        assert self._file_handle
        self._dict.clear()
        self._list = []
        for i, line in enumerate(self._file_handle):
            # A blank line means that id can be reused.
            tmp = self._build_func(line.rstrip("\n"))
            if line == "\n":
                self._list_of_empties.append(i)
            else:
                self._dict[tmp] = i
            self._list.append(tmp)
            self._line_cnt = i + 1
            self._next_id = i + 1
        IndexStoreBase.read_dict_file(self)
        return self._line_cnt

    def count_entries_removed_during_partial_indexing(self):
        """Returns the number of entries removed during a second phase
        of indexing.
        """
        return len(self._list)


class IndexStoreDict(IndexStoreBase):
    """Class used when only id -> entity lookup is needed."""

    def __init__(self, file_name):
        IndexStoreBase.__init__(self, file_name)
        self._dict = {}
        self._next_id = 0

    def get_dict(self):
        return self._dict

    def get_entity(self, in_id):
        return self._dict[in_id]

    def has_entity(self, entity):
        return entity in self._dict

    def read_dict_file(self):
        """Reads in a dictionary stored in line number -> entity
        format.
        """
        self._dict.clear()
        for line_cnt, line in enumerate(self._file_handle):
            line = line.rstrip("\n")
            self._dict[line_cnt] = line
        IndexStoreBase.read_dict_file(self)

    def count_entries_removed_during_partial_indexing(self):
        """Returns the number of entries removed during a second phase
        of indexing.
        """
        return len(self._dict)


class IndexStoreDictMutable(IndexStoreBase):
    """Dictionary which allows dynamic update of its storage."""

    def __init__(self, file_name):
        IndexStoreBase.__init__(self, file_name)
        self._dict = {}

    def get_dict(self):
        return self._dict

    def has_entity(self, entity):
        return entity in self._dict

    def get_id(self, entity):
        return self._dict[entity]

    def get_keys(self):
        return self._dict.keys()

    @staticmethod
    def __quote(s):
        if " " in s:
            return "1" + urllib.quote(s)
        else:
            return "0" + s

    def read_dict_file(self):
        """Reads in a dictionary stored with an entity
        and its number on each line.
        """
        self._dict.clear()
        for line in self._file_handle:
            token, offset = line.split(" ")
            if token[0] == "1":
                token = urllib.unquote(token[1:])
            else:
                token = token[1:]
            offset = int(offset)
            self._dict[token] = offset
        IndexStoreBase.read_dict_file(self)

    def open_out_file(self, use_dir, version_num):
        """Opens the output file for this class and prepares it
        to be written via write_entity.
        """
        self.write_dict_file(use_dir, version_num)
        self._file_handle = open(os.path.join(use_dir, self._name),
            'ab', buffering=PKG_FILE_BUFSIZ)

    def write_entity(self, entity, my_id):
        """Writes the entity out to the file with my_id."""
        assert self._file_handle is not None
        self._file_handle.write(self.__quote(str(entity)) + " " +
            str(my_id) + "\n")

    def write_dict_file(self, path, version_num):
        """Sets up the file with the version number stored correctly;
        the entries themselves are appended later via write_entity.
        """
        IndexStoreBase._protected_write_dict_file(self, path,
            version_num, [])

    def count_entries_removed_during_partial_indexing(self):
        """Returns the number of entries removed during a second phase
        of indexing.
        """
        return 0


class IndexStoreSetHash(IndexStoreBase):
    def __init__(self, file_name):
        IndexStoreBase.__init__(self, file_name)
        # In order to interoperate with older clients, we must use
        # sha-1 here.
        self.hash_val = hashlib.sha1().hexdigest()

    def set_hash(self, vals):
        """Sets the hash value."""
        self.hash_val = self.calc_hash(vals)

    def calc_hash(self, vals):
        """Calculates the hash value of the sorted members of vals."""
        vl = list(vals)
        vl.sort()
        # In order to interoperate with older clients, we must use
        # sha-1 here.
        shasum = hashlib.sha1()
        for v in vl:
            shasum.update(v)
        return shasum.hexdigest()
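
    # Illustrative sketch (not part of the original module): the hash
    # depends only on set membership, not on the order vals arrives
    # in, because the values are sorted before hashing.
    #
    #   store = IndexStoreSetHash(FULL_FMRI_HASH_FILE)
    #   assert store.calc_hash(["b", "a"]) == store.calc_hash(["a", "b"])
    #   # calc_hash([]) equals hashlib.sha1().hexdigest(), the value
    #   # hash_val is initialized to.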

    def write_dict_file(self, path, version_num):
        """Writes self.hash_val out to a line in a file."""
        IndexStoreBase._protected_write_dict_file(self, path,
            version_num, [self.hash_val])

    def read_dict_file(self):
        """Processes a dictionary file written using the above method.
        """
        sp = self._file_handle.tell()
        res = 0
        for res, line in enumerate(self._file_handle):
            assert res < 1
            self.hash_val = line.rstrip()
        self._file_handle.seek(sp)
        IndexStoreBase.read_dict_file(self)
        return res

    def check_against_file(self, vals):
        """Checks the hash value of vals against the value stored
        in the file for this object."""
        if not self._have_read:
            self.read_dict_file()
        incoming_hash = self.calc_hash(vals)
        if self.hash_val != incoming_hash:
            raise search_errors.IncorrectIndexFileHash(
                self.hash_val, incoming_hash)

    def count_entries_removed_during_partial_indexing(self):
        """Returns the number of entries removed during a second phase
        of indexing."""
        return 0


class IndexStoreSet(IndexStoreBase):
    """Used when only set membership is desired.
    This is currently designed for exclusive use
    with storage of fmri.PkgFmris. However, that impact
    is only seen in the read_and_discard_matching_from_argument
    method.
    """

    def __init__(self, file_name):
        IndexStoreBase.__init__(self, file_name)
        self._set = set()

    def get_set(self):
        return self._set

    def clear(self):
        self._set.clear()

    def add_entity(self, entity):
        self._set.add(entity)

    def remove_entity(self, entity):
        """remove_entity purposefully assumes that entity is
        already in the set to be removed. This is useful for
        error checking and debugging.
        """
        self._set.remove(entity)

    def has_entity(self, entity):
        return entity in self._set

    def write_dict_file(self, path, version_num):
        """Writes each member of the set out to a line in a file."""
        IndexStoreBase._protected_write_dict_file(self, path,
            version_num, self._set)

    def read_dict_file(self):
        """Processes a dictionary file written using the above method.
        """
        assert self._file_handle
        res = 0
        self._set.clear()
        for i, line in enumerate(self._file_handle):
            line = line.rstrip("\n")
            assert i == len(self._set)
            self.add_entity(line)
            res = i + 1
        IndexStoreBase.read_dict_file(self)
        return res

    def read_and_discard_matching_from_argument(self, fmri_set):
        """Reads the file and removes all fmris in the file
        from fmri_set.
        """
        if self._file_handle:
            for line in self._file_handle:
                f = fmri.PkgFmri(line)
                fmri_set.discard(f)

    def count_entries_removed_during_partial_indexing(self):
        """Returns the number of entries removed during a second phase
        of indexing."""
        return len(self._set)


class InvertedDict(IndexStoreBase):
    """Class used to store and process fmri to offset mappings. It
    does delta compression and deduplication of shared offset sets
    when writing to a file."""

    def __init__(self, file_name, p_id_trans):
        """file_name is the name of the file to write to or read from.

        p_id_trans is an object which has a get_entity method which,
        when given a package id number, returns the PkgFmri object
        for that id number."""
        IndexStoreBase.__init__(self, file_name)
        self._p_id_trans = p_id_trans
        self._dict = {}
        self._fmri_offsets = {}

    def __copy__(self):
        return self.__class__(self._name, self._p_id_trans)

    def add_pair(self, p_id, offset):
        """Adds a package id number and an associated offset to the
        existing dictionary."""
        try:
            self._fmri_offsets[p_id].append(offset)
        except KeyError:
            self._fmri_offsets[p_id] = [offset]

    def invert_id_to_offsets_dict(self):
        """Does delta encoding of offsets to reduce space by only
        storing the difference between the current offset and the
        previous offset. It also performs deduplication so that all
        packages with the same set of offsets share a common
        bucket."""
        inv = {}
        for p_id in self._fmri_offsets.keys():
            old_o = 0
            bucket = []
            for o in sorted(set(self._fmri_offsets[p_id])):
                bucket.append(o - old_o)
                old_o = o
            h = " ".join([str(o) for o in bucket])
            del self._fmri_offsets[p_id]
            if h not in inv:
                inv[h] = []
            inv[h].append(p_id)
        return inv
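
    # Illustrative sketch (not part of the original module): with
    # offsets {1: [10, 50, 30], 2: [10, 30, 50]} queued via add_pair,
    # each sorted offset list [10, 30, 50] delta-encodes to the key
    # "10 20 20", so both package ids share one bucket:
    #
    #   inv == {"10 20 20": [1, 2]}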

    @staticmethod
    def __make_line(offset_str, p_ids, trans):
        """For a given offset string, a list of package id numbers,
        and a translator from package id numbers to PkgFmris, returns
        the string which represents that information. Its format is
        space-separated package fmris, followed by a '!', followed by
        space-separated offsets which have had delta compression
        performed."""
        return " ".join([
            trans.get_entity(p_id).get_fmri(anarchy=True,
                include_scheme=False)
            for p_id in p_ids
        ]) + "!" + offset_str

    def write_dict_file(self, path, version_num):
        """Writes the mapping of package fmris to offset sets out
        to the file."""
        inv = self.invert_id_to_offsets_dict()
        IndexStoreBase._protected_write_dict_file(self, path,
            version_num, (
                self.__make_line(o, inv[o], self._p_id_trans)
                for o in inv
            ))

    def read_dict_file(self):
        """Reads a file written by the above function and stores the
        information in a dictionary."""
        assert self._file_handle
        for l in self._file_handle:
            fmris, offs = l.split("!")
            self._dict[fmris] = offs
        IndexStoreBase.read_dict_file(self)

    @staticmethod
    def de_delta(offs):
        """For a list of strings of offsets, undoes the delta
        compression that has been performed."""
        old_o = 0
        ret = []
        for o in offs:
            o = int(o) + old_o
            ret.append(o)
            old_o = o
        return ret
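
    # Illustrative sketch (not part of the original module): de_delta
    # reverses the encoding done in invert_id_to_offsets_dict.
    #
    #   InvertedDict.de_delta("10 20 20".split()) == [10, 30, 50]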

    def get_offsets(self, match_func):
        """For a given function which returns true if it matches the
        desired fmri, returns the offsets which are associated with
        the fmris which match."""
        offs = []
        for fmris in self._dict.keys():
            for p in fmris.split():
                if match_func(p):
                    offs.extend(self.de_delta(
                        self._dict[fmris].split()))
                    break
        return set(offs)
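
    # Illustrative sketch (not part of the original module): a caller
    # could collect the offsets for every stored fmri string matching
    # a hypothetical package name prefix:
    #
    #   offsets = inverted.get_offsets(
    #       lambda f: f.startswith("example/pkg"))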