search_storage.py revision 3234
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen# CDDL HEADER START
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen# The contents of this file are subject to the terms of the
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen# Common Development and Distribution License (the "License").
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen# You may not use this file except in compliance with the License.
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen# See the License for the specific language governing permissions
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen# and limitations under the License.
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen# When distributing Covered Code, include this CDDL HEADER in each
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen# If applicable, add the following below this CDDL HEADER, with the
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen# fields enclosed by brackets "[]" replaced with your own identifying
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen# information: Portions Copyright [yyyy] [name of copyright owner]
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen# CDDL HEADER END
f6497ac81e6de57870936d538acccb75ce408fc1Timo Sirainen# Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainenfrom six.moves.urllib.parse import quote, unquote
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainendef consistent_open(data_list, directory, timeout = 1):
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen """Opens all data holders in data_list and ensures that the
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen versions are consistent among all of them.
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen It retries several times in case a race condition between file
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen migration and open is encountered.
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen Note: Do not set timeout to be 0. It will cause an exception to be
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen immediately raised.
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen while cur_version == None and missing != True:
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen # The assignments to cur_version and missing cannot be
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen # placed here. They must be reset prior to breaking out of the
4bfa47e475c957adfc645047660d8ce96a3371a7Timo Sirainen # for loop so that the while loop condition will be true. They
4bfa47e475c957adfc645047660d8ce96a3371a7Timo Sirainen # cannot be placed after the for loop since that path is taken
4bfa47e475c957adfc645047660d8ce96a3371a7Timo Sirainen # when all files are missing or opened successfully.
4bfa47e475c957adfc645047660d8ce96a3371a7Timo Sirainen if timeout != None and ((time.time() - start_time) > timeout):
4bfa47e475c957adfc645047660d8ce96a3371a7Timo Sirainen raise search_errors.InconsistentIndexException(
4bfa47e475c957adfc645047660d8ce96a3371a7Timo Sirainen # All indexes must have the same version and all must
4bfa47e475c957adfc645047660d8ce96a3371a7Timo Sirainen # either be present or absent for a successful return.
4bfa47e475c957adfc645047660d8ce96a3371a7Timo Sirainen # If one of these conditions is not met, the function
4bfa47e475c957adfc645047660d8ce96a3371a7Timo Sirainen # tries again until it succeeds or the time spent in
4bfa47e475c957adfc645047660d8ce96a3371a7Timo Sirainen # the function is greater than timeout.
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen f = os.path.join(directory, d.get_file_name())
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen # If we get here, then the current index file
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen # is present.
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen # Read the version. If this is the first file,
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen # set the expected version otherwise check that
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen # the version matches the expected version.
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen # Got inconsistent versions, so close
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen # all files and try again.
f6ae9ae80a1fcf6c8f45ab759f0074caaa66c9c8Timo Sirainen # If the index file is missing, ensure
f6ae9ae80a1fcf6c8f45ab759f0074caaa66c9c8Timo Sirainen # that previous files were missing as
f6ae9ae80a1fcf6c8f45ab759f0074caaa66c9c8Timo Sirainen # well. If not, try again.
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen assert cur_version == None
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen # The index is missing (ie, no files were present).
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen assert cur_version is not None
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen """Base class for all data storage used by the indexer and
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen queryEngine. All members must have a file name and maintain
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen an internal file handle to that file as instructed by external
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen raise RuntimeError("setting an extant file handle, "
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen """Closes the file handle and clears it so that it cannot
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen def _protected_write_dict_file(self, path, version_num, iterable):
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen """Writes the dictionary in the expected format.
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen Note: Only child classes should call this method.
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen file_handle = open(os.path.join(path, self._name), 'wb')
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen file_handle.write(version_string + str(version_num) + "\n")
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen """This method uses the modification time and the file size
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen to (heuristically) determine whether the file backing this
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen storage has changed since it was last read.
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen """This uses consistent open to ensure that the version line
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen processing is done consistently and that only a single function
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen actually opens files stored using this class.
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen """Class for representing the main dictionary file
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen # Here is an example of a line from the main dictionary, it is
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen # explained below:
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen # %25gconf.xml file!basename@basename#579,13249,13692,77391,77628
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen # Each line begins with a urllib quoted search token. It's followed by
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen # a set of space separated lists. Each of these lists begin with an
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen # action type. It's separated from its sublist by a '!'. Next is the
4e8e7a93628b4ed60aaaa47c6f72c1433f21e81dTimo Sirainen # key type, which is separated from its sublist by a '@'. Next is the
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen # full value, which is used in set actions to hold the full value which
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen # matched the token. It's separated from its sublist by a '#'. The
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen # next token (579) is the fmri id. The subsequent comma separated
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen # values are the byte offsets into that manifest of the lines containing
b1b8ac3b3c667405e1533aa6db26e55218d26cc6Timo Sirainen # that token.
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen """This class relies on external methods to write the file.
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen Making this empty call to protected_write_dict_file allows the
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen file to be set up correctly with the version number stored
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen IndexStoreBase._protected_write_dict_file(self, path,
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen """Return the file handle. Note that doing
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen anything other than sequential reads or writes
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen to or from this file_handle may result in unexpected
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen behavior. In short, don't use seek.
4e8e7a93628b4ed60aaaa47c6f72c1433f21e81dTimo Sirainen """Parses one line of a main dictionary file.
4e8e7a93628b4ed60aaaa47c6f72c1433f21e81dTimo Sirainen Changes to this function must be paired with changes to
4e8e7a93628b4ed60aaaa47c6f72c1433f21e81dTimo Sirainen write_main_dict_line below.
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen This should produce the same data structure that
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen _write_main_dict_line in indexer.py creates to write out each
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen """Pulls the token out of a line from a main dictionary file.
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen Changes to this function must be paired with changes to
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen write_main_dict_line below.
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen """Paired with parse_main_dict_line above. Transforms a token
4e8e7a93628b4ed60aaaa47c6f72c1433f21e81dTimo Sirainen and its data into the string which can be written to the main
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen The "token" parameter is the token whose index line is being
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen The "entries" parameter is a list of lists of lists and so on.
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen It contains information about where and how "token" was seen in
4e8e7a93628b4ed60aaaa47c6f72c1433f21e81dTimo Sirainen manifests. The depth of all lists at each level must be
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen consistent, and must match the length of "sep_chars" and
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen "quote". The details of the contents on entries are described
4e8e7a93628b4ed60aaaa47c6f72c1433f21e81dTimo Sirainen in _write_main_dict_line in indexer.py.
4e8e7a93628b4ed60aaaa47c6f72c1433f21e81dTimo Sirainen res += "{0}{1}".format(sep_chars[0], action_type)
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen def count_entries_removed_during_partial_indexing(self):
4e8e7a93628b4ed60aaaa47c6f72c1433f21e81dTimo Sirainen """Returns the number of entries removed during a second phase
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen # This returns 0 because this class is not responsible for
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen # storing anything in memory.
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen """Moves the existing file with self._name in directory
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen use_dir to a new file named self._name + suffix in directory
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen use_dir. If it has done this previously, it removes the old
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen file it moved. It also opens the newly moved file and uses
736b1800b0409ba7443d33ecb8d0fb9f8b091660Timo Sirainen that as the file for its file handle.
a18503d5dc0751a1f9785e48438a219d95c0b9c2Timo Sirainen new_path = os.path.join(use_dir, self._name + suffix)
736b1800b0409ba7443d33ecb8d0fb9f8b091660Timo Sirainen os.remove(os.path.join(use_dir, self._old_suffix))
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen """Used when both a list and a dictionary are needed to
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen store the information. Used for bidirectional lookup when
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen one item is an int (an id) and the other is not (an entity). It
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen maintains a list of empty spots in the list so that adding entities
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen can take advantage of unused space. It encodes empty space as a blank
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen line in the file format and '' in the internal list.
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen def __init__(self, file_name, build_function=lambda x: x,
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen """Adds an entity consistently to the list and dictionary
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen allowing bidirectional lookup.
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen """deletes in_id from the list and the dictionary """
736b1800b0409ba7443d33ecb8d0fb9f8b091660Timo Sirainen """deletes the entity from the list and the dictionary """
736b1800b0409ba7443d33ecb8d0fb9f8b091660Timo Sirainen """returns the id of entity """
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen """Adds entity if it's not previously stored and returns the
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen id for entity.
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen # This code purposefully reimplements add_entity
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen # code. Replacing the function calls to has_entity, add_entity,
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen # and get_id with direct access to the data structure gave a
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen # speed up of a factor of 4. Because this is a very hot path,
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen # the tradeoff seemed appropriate.
4e8e7a93628b4ed60aaaa47c6f72c1433f21e81dTimo Sirainen """return the entity in_id maps to """
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen """check if entity is in storage """
4e8e7a93628b4ed60aaaa47c6f72c1433f21e81dTimo Sirainen """Check if the structure has any empty elements which
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen can be filled with data.
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen """returns the next id which maps to no element """
4e8e7a93628b4ed60aaaa47c6f72c1433f21e81dTimo Sirainen """Passes self._list to the parent class to write to a file.
f6ae9ae80a1fcf6c8f45ab759f0074caaa66c9c8Timo Sirainen IndexStoreBase._protected_write_dict_file(self, path,
f6ae9ae80a1fcf6c8f45ab759f0074caaa66c9c8Timo Sirainen version_num, (self._decode_func(l) for l in self._list))
f6ae9ae80a1fcf6c8f45ab759f0074caaa66c9c8Timo Sirainen """Reads in a dictionary previously stored using the above
f6ae9ae80a1fcf6c8f45ab759f0074caaa66c9c8Timo Sirainen # A blank line means that id can be reused.
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen def count_entries_removed_during_partial_indexing(self):
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen """Returns the number of entries removed during a second phase
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen """Class used when only entity -> id lookup is needed
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen """Reads in a dictionary stored in line number -> entity
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen for line_cnt, line in enumerate(self._file_handle):
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen def count_entries_removed_during_partial_indexing(self):
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen """Returns the number of entries removed during a second phase
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen """Dictionary which allows dynamic update of its storage
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen """Reads in a dictionary stored with an entity
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen and its number on each line.
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen def open_out_file(self, use_dir, version_num):
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen """Opens the output file for this class and prepares it
f06cc4cb6542c49430ed96b1a1459a2952d820c3Timo Sirainen to be written via write_entity.
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen self._file_handle = open(os.path.join(use_dir, self._name),
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen """Writes the entity out to the file with my_id """
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen self._file_handle.write(self.__quote(str(entity)) + " " +
f6497ac81e6de57870936d538acccb75ce408fc1Timo Sirainen """ Generates an iterable list of string representations of
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen the dictionary that the parent's protected_write_dict_file
2eccb2637d0153bb7f9ad39a70f254cece74342cTimo Sirainen function can call.
f6497ac81e6de57870936d538acccb75ce408fc1Timo Sirainen IndexStoreBase._protected_write_dict_file(self, path,
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen def count_entries_removed_during_partial_indexing(self):
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen """Returns the number of entries removed during a second phase
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen # In order to interoperate with older clients, we must use sha-1
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen """Set the hash value."""
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen """Calculate the hash value of the sorted members of vals."""
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen # In order to interoperate with older clients, we must use sha-1
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen """Write self.hash_val out to a line in a file """
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen IndexStoreBase._protected_write_dict_file(self, path,
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen """Process a dictionary file written using the above method
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen for res, line in enumerate(self._file_handle):
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen """Check the hash value of vals against the value stored
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen in the file for this object."""
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen def count_entries_removed_during_partial_indexing(self):
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen """Returns the number of entries removed during a second phase
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen of indexing."""
f06cc4cb6542c49430ed96b1a1459a2952d820c3Timo Sirainen """Used when only set membership is desired.
f06cc4cb6542c49430ed96b1a1459a2952d820c3Timo Sirainen This is currently designed for exclusive use
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen with storage of fmri.PkgFmris. However, that impact
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen is only seen in the read_and_discard_matching_from_argument
bb869cc24b24a8df84a43154c628785d6aee784cTimo Sirainen """Remove entity purposefully assumes that entity is
bb869cc24b24a8df84a43154c628785d6aee784cTimo Sirainen already in the set to be removed. This is useful for
bb869cc24b24a8df84a43154c628785d6aee784cTimo Sirainen error checking and debugging.
bb869cc24b24a8df84a43154c628785d6aee784cTimo Sirainen """Write each member of the set out to a line in a file """
bb869cc24b24a8df84a43154c628785d6aee784cTimo Sirainen IndexStoreBase._protected_write_dict_file(self, path,
bb869cc24b24a8df84a43154c628785d6aee784cTimo Sirainen """Process a dictionary file written using the above method
bb869cc24b24a8df84a43154c628785d6aee784cTimo Sirainen def read_and_discard_matching_from_argument(self, fmri_set):
bb869cc24b24a8df84a43154c628785d6aee784cTimo Sirainen """Reads the file and removes all fmris in the file
bb869cc24b24a8df84a43154c628785d6aee784cTimo Sirainen from fmri_set.
except KeyError:
inv = {}
bucket = []
old_o = o
if h not in inv:
inv[h] = []
return inv
version_num, (
for o in inv
ret = []
for o in offs:
old_o = o
return ret
offs = []
if match_func(p):