search_storage.py revision 3234
2454dfa32c93c20a8522c6ed42fe057baaac9f9aStephan Bosch#!/usr/bin/python
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen#
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen# CDDL HEADER START
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen#
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen# The contents of this file are subject to the terms of the
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen# Common Development and Distribution License (the "License").
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen# You may not use this file except in compliance with the License.
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen#
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen# or http://www.opensolaris.org/os/licensing.
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen# See the License for the specific language governing permissions
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen# and limitations under the License.
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen#
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen# When distributing Covered Code, include this CDDL HEADER in each
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen# If applicable, add the following below this CDDL HEADER, with the
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen# fields enclosed by brackets "[]" replaced with your own identifying
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen# information: Portions Copyright [yyyy] [name of copyright owner]
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen#
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen# CDDL HEADER END
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen#
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen#
f6497ac81e6de57870936d538acccb75ce408fc1Timo Sirainen# Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
f6ae9ae80a1fcf6c8f45ab759f0074caaa66c9c8Timo Sirainen#
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainenimport os
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainenimport errno
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainenimport time
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainenimport hashlib
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainenfrom six.moves.urllib.parse import quote, unquote
a18503d5dc0751a1f9785e48438a219d95c0b9c2Timo Sirainen
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainenimport pkg.fmri as fmri
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainenimport pkg.search_errors as search_errors
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainenimport pkg.portable as portable
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainenfrom pkg.misc import PKG_FILE_BUFSIZ
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen
# Names used for the individual pieces of the search index.  They look
# like file names inside the index directory -- NOTE(review): confirm
# against the code that consumes them, which is outside this section.

# Incremental add/remove lists.
FAST_ADD = "fast_add.v1"
FAST_REMOVE = "fast_remove.v1"

# Manifest and fmri bookkeeping.
MANIFEST_LIST = "manf_list.v1"
FULL_FMRI_FILE = "full_fmri_list"
FULL_FMRI_HASH_FILE = "full_fmri_list.hash"
FMRI_OFFSETS_FILE = "fmri_offsets.v1"

# Main token dictionary and the byte offsets into it.
MAIN_FILE = "main_dict.ascii.v2"
BYTE_OFFSET_FILE = "token_byte_offset.v1"
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen
def consistent_open(data_list, directory, timeout=1):
    """Opens all data holders in data_list and ensures that the
    versions are consistent among all of them.
    It retries several times in case a race condition between file
    migration and open is encountered.

    The "data_list" parameter is a sequence of objects that provide
    get_file_name(), set_file_handle(handle, path) and
    close_file_handle().

    The "directory" parameter is the directory the files are looked up
    in.

    The "timeout" parameter is the number of seconds to keep retrying.
    None disables the time limit.
    Note: Do not set timeout to be 0.  It will cause an exception to be
    immediately raised.

    Returns the version number shared by all files, or None if none of
    the files exist.  Raises InconsistentIndexException once the timeout
    is exceeded.  Any IOError other than ENOENT is re-raised after all
    file handles have been closed.
    """

    missing = None
    cur_version = None

    start_time = time.time()

    while cur_version is None and missing is not True:
        # The assignments to cur_version and missing cannot be
        # placed here. They must be reset prior to breaking out of the
        # for loop so that the while loop condition will be true. They
        # cannot be placed after the for loop since that path is taken
        # when all files are missing or opened successfully.
        if timeout is not None and \
            ((time.time() - start_time) > timeout):
            raise search_errors.InconsistentIndexException(
                directory)
        for d in data_list:
            # All indexes must have the same version and all must
            # either be present or absent for a successful return.
            # If one of these conditions is not met, the function
            # tries again until it succeeds or the time spent in
            # the function is greater than timeout.
            try:
                f = os.path.join(directory, d.get_file_name())
                fh = open(f, 'rb')
                # If we get here, then the current index file
                # is present.
                if missing is None:
                    missing = False
                elif missing:
                    # An earlier file was missing but this one
                    # exists.  The new handle has not been
                    # handed to its holder yet, so it must be
                    # closed here or it would leak on every
                    # retry.
                    fh.close()
                    for dl in data_list:
                        dl.close_file_handle()
                    missing = None
                    cur_version = None
                    break
                d.set_file_handle(fh, f)
                version_tmp = fh.readline()
                # The handle is binary, so the line is bytes on
                # Python 3 and str on Python 2.  A bare split()
                # and int() accept both, which split(' ') and
                # rstrip('\n') with str arguments do not.
                version_num = int(version_tmp.split()[1])
                # Read the version. If this is the first file,
                # set the expected version otherwise check that
                # the version matches the expected version.
                if cur_version is None:
                    cur_version = version_num
                elif cur_version != version_num:
                    # Got inconsistent versions, so close
                    # all files and try again.
                    for dl in data_list:
                        dl.close_file_handle()
                    missing = None
                    cur_version = None
                    break
            except IOError as e:
                if e.errno == errno.ENOENT:
                    # If the index file is missing, ensure
                    # that previous files were missing as
                    # well. If not, try again.
                    if missing is False:
                        for dl in data_list:
                            dl.close_file_handle()
                        missing = None
                        cur_version = None
                        break
                    missing = True
                else:
                    for dl in data_list:
                        dl.close_file_handle()
                    raise
    if missing:
        assert cur_version is None
        # The index is missing (ie, no files were present).
        return None
    else:
        assert cur_version is not None
        return cur_version
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen
class IndexStoreBase(object):
    """Base class for all data storage used by the indexer and
    queryEngine.  All members must have a file name and maintain
    an internal file handle to that file as instructed by external
    calls.
    """

    def __init__(self, file_name):
        """The "file_name" parameter is the name of the backing file,
        without any directory component.
        """
        self._name = file_name
        self._file_handle = None
        self._file_path = None
        # stat information captured the first time a handle is set;
        # compared against the file on disk by should_reread().
        self._size = None
        self._mtime = None
        self._inode = None
        self._have_read = False

    def get_file_name(self):
        """Returns the name of the backing file."""
        return self._name

    def set_file_handle(self, f_handle, f_path):
        """Stores an open file handle and the path it was opened from.

        Raises RuntimeError if a handle is already set; the caller
        must call close_file_handle() first.
        """
        if self._file_handle:
            raise RuntimeError("setting an extant file handle, "
                "must close first, fp is: " + f_path)
        else:
            self._file_handle = f_handle
            self._file_path = f_path
            # The stat information is only recorded once, the first
            # time a handle is attached to this object.
            if self._mtime is None:
                stat_info = os.stat(self._file_path)
                self._mtime = stat_info.st_mtime
                self._size = stat_info.st_size
                self._inode = stat_info.st_ino

    def get_file_path(self):
        """Returns the path passed to the last set_file_handle call,
        or None if no handle has been set.
        """
        return self._file_path

    def __copy__(self):
        """Returns a fresh instance of the same class for the same file
        name.  No file handle or cached state is carried over.
        """
        return self.__class__(self._name)

    def close_file_handle(self):
        """Closes the file handle and clears it so that it cannot
        be reused.
        """

        if self._file_handle:
            self._file_handle.close()
            self._file_handle = None

    def _protected_write_dict_file(self, path, version_num, iterable):
        """Writes the dictionary in the expected format: a
        "VERSION: <n>" line followed by one line per item of
        "iterable".
        Note: Only child classes should call this method.
        """

        def as_bytes(text):
            # The file is opened in binary mode.  On Python 2 a str
            # already is bytes and is written unchanged; on Python 3
            # it has to be encoded or write() raises TypeError.
            if isinstance(text, bytes):
                return text
            return text.encode("utf-8")

        version_string = "VERSION: "
        # The with statement closes the file even if a write fails.
        with open(os.path.join(path, self._name), 'wb') as file_handle:
            file_handle.write(
                as_bytes(version_string + str(version_num) + "\n"))
            for name in iterable:
                file_handle.write(as_bytes(str(name) + "\n"))

    def should_reread(self):
        """This method uses the inode, the modification time and the
        file size to (heuristically) determine whether the file backing
        this storage has changed since it was last read.  It also
        returns True if the file has never been read.
        """
        stat_info = os.stat(self._file_path)
        if self._inode != stat_info.st_ino or \
            self._mtime != stat_info.st_mtime or \
            self._size != stat_info.st_size:
            return True
        return not self._have_read

    def read_dict_file(self):
        """Records that the backing file has been read."""
        self._have_read = True

    def open(self, directory):
        """This uses consistent open to ensure that the version line
        processing is done consistently and that only a single function
        actually opens files stored using this class.

        Returns whatever consistent_open returns for this one file.
        """
        return consistent_open([self], directory)
ae9691f7ef36d5272d72c90fa51393dfea5dd126Timo Sirainen
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen
class IndexStoreMainDict(IndexStoreBase):
    """Class for representing the main dictionary file
    """
    # Here is an example of a line from the main dictionary, it is
    # explained below:
    # %25gconf.xml file!basename@basename#579,13249,13692,77391,77628
    #
    # Each line begins with a urllib quoted search token. It's followed by
    # a set of space separated lists.  Each of these lists begin with an
    # action type.  It's separated from its sublist by a '!'.  Next is the
    # key type, which is separated from its sublist by a '@'.  Next is the
    # full value, which is used in set actions to hold the full value which
    # matched the token.  It's separated from its sublist by a '#'.  The
    # next token (579) is the fmri id.  The subsequent comma separated
    # values are the byte offsets into that manifest of the lines containing
    # that token.

    sep_chars = [" ", "!", "@", "#", ","]

    def __init__(self, file_name):
        IndexStoreBase.__init__(self, file_name)
        # Name of the file created by the previous shift_file call,
        # or None if shift_file has not been called yet.
        self._old_suffix = None

    def write_dict_file(self, path, version_num):
        """This class relies on external methods to write the file.
        Making this empty call to protected_write_dict_file allows the
        file to be set up correctly with the version number stored
        correctly.
        """
        IndexStoreBase._protected_write_dict_file(self, path,
            version_num, [])

    def get_file_handle(self):
        """Return the file handle.  Note that doing
        anything other than sequential reads or writes
        to or from this file_handle may result in unexpected
        behavior.  In short, don't use seek.
        """
        return self._file_handle

    @staticmethod
    def parse_main_dict_line(line):
        """Parses one line of a main dictionary file.
        Changes to this function must be paired with changes to
        transform_main_dict_line below.

        This should produce the same data structure that
        _write_main_dict_line in indexer.py creates to write out each
        line.

        Returns a tuple of the unquoted token and a nested list of
        the form:
            [(action_type,
                [(subtype,
                    [(full_value,
                        [(pfmri_index, [offset, ...]), ...]),
                    ...]),
                ...]),
            ...]
        """

        split_chars = IndexStoreMainDict.sep_chars
        line = line.rstrip('\n')
        fields = line.split(split_chars[0])
        tok = unquote(fields[0])
        res = []
        # Each level peels off its leading label and then splits the
        # remainder on the next separator in sep_chars.
        for action_item in fields[1:]:
            action_parts = action_item.split(split_chars[1])
            action_type = action_parts[0]
            at_res = []
            for subtype_item in action_parts[1:]:
                subtype_parts = subtype_item.split(split_chars[2])
                subtype = subtype_parts[0]
                st_res = []
                for value_item in subtype_parts[1:]:
                    value_parts = value_item.split(
                        split_chars[3])
                    full_value = unquote(value_parts[0])
                    fv_res = []
                    for fmri_item in value_parts[1:]:
                        numbers = fmri_item.split(
                            split_chars[4])
                        pfmri_index = int(numbers[0])
                        offsets = [
                            int(t) for t in numbers[1:]
                        ]
                        fv_res.append(
                            (pfmri_index, offsets))
                    st_res.append((full_value, fv_res))
                at_res.append((subtype, st_res))
            res.append((action_type, at_res))
        return tok, res

    @staticmethod
    def parse_main_dict_line_for_token(line):
        """Pulls the token out of a line from a main dictionary file.
        Changes to this function must be paired with changes to
        transform_main_dict_line below.
        """

        line = line.rstrip("\n")
        lst = line.split(" ", 1)
        return unquote(lst[0])

    @staticmethod
    def transform_main_dict_line(token, entries):
        """Paired with parse_main_dict_line above.  Transforms a token
        and its data into the string which can be written to the main
        dictionary.

        The "token" parameter is the token whose index line is being
        generated.

        The "entries" parameter is a list of lists of lists and so on.
        It contains information about where and how "token" was seen in
        manifests.  The depth of all lists at each level must be
        consistent, and must match the length of "sep_chars" and
        "quote".  The details of the contents on entries are described
        in _write_main_dict_line in indexer.py.

        Returns the line, terminated by a newline.
        """
        sep_chars = IndexStoreMainDict.sep_chars
        # Collect the pieces and join once rather than growing the
        # string with repeated concatenation.
        pieces = ["{0}".format(quote(str(token)))]
        for action_type, subtypes in entries:
            pieces.append("{0}{1}".format(sep_chars[0], action_type))
            for subtype, full_values in subtypes:
                pieces.append(
                    "{0}{1}".format(sep_chars[1], subtype))
                for full_value, pfmris in full_values:
                    pieces.append("{0}{1}".format(
                        sep_chars[2], quote(str(full_value))))
                    for pfmri_index, offsets in pfmris:
                        pieces.append("{0}{1}".format(
                            sep_chars[3], pfmri_index))
                        for offset in offsets:
                            pieces.append("{0}{1}".format(
                                sep_chars[4], offset))
        return "".join(pieces) + "\n"

    def count_entries_removed_during_partial_indexing(self):
        """Returns the number of entries removed during a second phase
        of indexing.
        """
        # This returns 0 because this class is not responsible for
        # storing anything in memory.
        return 0

    def shift_file(self, use_dir, suffix):
        """Moves the existing file with self._name in directory
        use_dir to a new file named self._name + suffix in directory
        use_dir.  If it has done this previously, it removes the old
        file it moved.  It also opens the newly moved file and uses
        that as the file for its file handle.

        The file handle must be closed before this is called.
        """
        assert self._file_handle is None
        orig_path = os.path.join(use_dir, self._name)
        new_path = os.path.join(use_dir, self._name + suffix)
        portable.rename(orig_path, new_path)
        # open() looks the file up by self._name, so the name is
        # switched to the moved file for the duration of the call.
        # It is restored even if open() raises, so that a failure does
        # not leave this object pointing at the suffixed name.
        tmp_name = self._name
        self._name = self._name + suffix
        try:
            self.open(use_dir)
        finally:
            self._name = tmp_name
        if self._old_suffix is not None:
            os.remove(os.path.join(use_dir, self._old_suffix))
        self._old_suffix = self._name + suffix
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen
86bdb644d147a73df85abce4325254d694217a5fTimo Sirainen
class IndexStoreListDict(IndexStoreBase):
    """Store used when both a list and a dictionary are needed.

    Offers bidirectional lookup where one side is an int (an id) and the
    other side is not (an entity).  The id of an entity is its index in
    the internal list.  A list of empty spots is kept so that adding
    entities can take advantage of unused space.  Empty space is encoded
    as a blank line in the file format and '' in the internal list.
    """

    def __init__(self, file_name, build_function=lambda x: x,
                 decode_function=lambda x: x):
        """Create an empty store.

        file_name is handed to IndexStoreBase.  build_function is applied
        to every line (without its trailing newline) read by
        read_dict_file; decode_function is applied to every list element
        handed to the parent class by write_dict_file.
        """
        IndexStoreBase.__init__(self, file_name)
        self._list = []
        self._dict = {}
        self._next_id = 0
        self._list_of_empties = []
        self._decode_func = decode_function
        self._build_func = build_function
        self._line_cnt = 0

    def add_entity(self, entity, is_empty):
        """Add an entity consistently to the list and dictionary,
        allowing bidirectional lookup, and return the id used.

        When is_empty is false and an empty id is available, the oldest
        recorded empty id is reused; otherwise the entity is appended.
        When is_empty is true the entity is appended to the list only and
        is not entered into the dictionary.
        """
        assert len(self._list) == self._next_id
        if self._list_of_empties and not is_empty:
            use_id = self._list_of_empties.pop(0)
            assert use_id <= len(self._list)
            if use_id == len(self._list):
                # The empty id is one past the end of the list.
                self._list.append(entity)
                self._next_id += 1
            else:
                self._list[use_id] = entity
        else:
            use_id = self._next_id
            self._list.append(entity)
            self._next_id += 1
        if not is_empty:
            self._dict[entity] = use_id
        assert len(self._list) == self._next_id
        return use_id

    def remove_id(self, in_id):
        """Blank out in_id in the list and the dictionary.

        The list slot becomes '' and the dictionary value of the entity
        that was stored there becomes ''.  The key itself stays in the
        dictionary and in_id is not added to the list of empties here.
        """
        entity = self._list[in_id]
        self._list[in_id] = ""
        self._dict[entity] = ""

    def remove_entity(self, entity):
        """Blank out entity in the list and the dictionary.

        Raises KeyError if entity was never stored.  As with remove_id,
        the key stays in the dictionary with the value ''.
        """
        in_id = self._dict[entity]
        self._dict[entity] = ""
        self._list[in_id] = ""

    def get_id(self, entity):
        """Return the id of entity (KeyError if it is unknown)."""
        return self._dict[entity]

    def get_id_and_add(self, entity):
        """Add entity if it's not previously stored and return the
        id for entity.
        """
        # This code purposefully reimplements add_entity code.
        # Replacing the function calls to has_entity, add_entity, and
        # get_id with direct access to the data structure gave a speed
        # up of a factor of 4.  Because this is a very hot path, the
        # tradeoff seemed appropriate.

        if entity not in self._dict:
            assert len(self._list) == self._next_id
            if self._list_of_empties:
                use_id = self._list_of_empties.pop(0)
                assert use_id <= len(self._list)
                if use_id == len(self._list):
                    self._list.append(entity)
                    self._next_id += 1
                else:
                    self._list[use_id] = entity
            else:
                use_id = self._next_id
                self._list.append(entity)
                self._next_id += 1
            self._dict[entity] = use_id
        assert len(self._list) == self._next_id
        return self._dict[entity]

    def get_entity(self, in_id):
        """Return the entity in_id maps to."""
        return self._list[in_id]

    def has_entity(self, entity):
        """Check whether entity is a key of the dictionary.

        Entities blanked by remove_id/remove_entity remain keys, so they
        are still reported as present.
        """
        return entity in self._dict

    def has_empty(self):
        """Check if the structure has any empty elements which
        can be filled with data.
        """
        return len(self._list_of_empties) > 0

    def get_next_empty(self):
        """Remove and return an id which maps to no element.

        The id is taken from the end of the list of empties (add_entity
        takes from the front).  Raises IndexError when there is none.
        """
        return self._list_of_empties.pop()

    def write_dict_file(self, path, version_num):
        """Pass the decoded contents of self._list to the parent class
        to write to a file.
        """
        IndexStoreBase._protected_write_dict_file(
            self, path, version_num,
            (self._decode_func(l) for l in self._list))

    def read_dict_file(self):
        """Read in a dictionary previously stored using write_dict_file
        and return the number of lines read.
        """
        assert self._file_handle
        self._dict.clear()
        self._list = []
        # Reset every piece of state derived from the file, not just the
        # list and dictionary.  Otherwise a second read would record the
        # empty ids twice, and reading an empty file would leave _next_id
        # out of step with the (now empty) list.
        self._list_of_empties = []
        self._line_cnt = 0
        self._next_id = 0
        for i, line in enumerate(self._file_handle):
            # A blank line means that id can be reused.
            tmp = self._build_func(line.rstrip("\n"))
            if line == "\n":
                self._list_of_empties.append(i)
            else:
                self._dict[tmp] = i
            self._list.append(tmp)
            self._line_cnt = i + 1
            self._next_id = i + 1
        IndexStoreBase.read_dict_file(self)
        return self._line_cnt

    def count_entries_removed_during_partial_indexing(self):
        """Returns the number of entries removed during a second phase
        of indexing.  For this class that is the length of the list.
        """
        return len(self._list)
e0ba54c7f985fc403b41c6e36d6a7f44908b23f0Timo Sirainen
class IndexStoreDict(IndexStoreBase):
    """Read-only style store backed by a single dictionary.

    After read_dict_file the dictionary maps a zero-based line number to
    the text of that line.
    """

    def __init__(self, file_name):
        """Create an empty store for file_name."""
        IndexStoreBase.__init__(self, file_name)
        self._dict = {}
        self._next_id = 0

    def get_dict(self):
        """Return the underlying dictionary itself (not a copy)."""
        return self._dict

    def get_entity(self, in_id):
        """Return the value stored under in_id (KeyError if absent)."""
        return self._dict[in_id]

    def has_entity(self, entity):
        """Report whether entity is a key of the dictionary.

        NOTE(review): read_dict_file keys the dictionary by line number,
        so this tests ids rather than line contents -- confirm that is
        what callers expect.
        """
        return entity in self._dict

    def read_dict_file(self):
        """Reads in a dictionary stored in line number -> entity
        format
        """
        self._dict.clear()
        self._dict.update(
            (line_no, text.rstrip("\n"))
            for line_no, text in enumerate(self._file_handle))
        IndexStoreBase.read_dict_file(self)

    def count_entries_removed_during_partial_indexing(self):
        """Returns the number of entries removed during a second phase
        of indexing.  For this class that is the size of the dictionary.
        """
        return len(self._dict)
f6497ac81e6de57870936d538acccb75ce408fc1Timo Sirainen
class IndexStoreDictMutable(IndexStoreBase):
    """Dictionary which allows dynamic update of its storage.

    Each record in the file is "<flag><token> <number>": the flag is "1"
    when the token was quoted because it contained a space, else "0".
    """

    def __init__(self, file_name):
        """Create an empty store for file_name."""
        IndexStoreBase.__init__(self, file_name)
        self._dict = {}

    def get_dict(self):
        """Return the underlying dictionary itself (not a copy)."""
        return self._dict

    def has_entity(self, entity):
        """Report whether entity is a key of the dictionary."""
        return entity in self._dict

    def get_id(self, entity):
        """Return the number stored for entity (KeyError if absent)."""
        return self._dict[entity]

    def get_keys(self):
        """Return a new list holding the dictionary's keys."""
        return list(self._dict)

    @staticmethod
    def __quote(text):
        """Prefix text with its one-character quoting flag.

        Text containing a space is quoted and flagged "1"; anything else
        is passed through unchanged and flagged "0".
        """
        if " " not in text:
            return "0" + text
        return "1" + quote(text)

    def read_dict_file(self):
        """Reads in a dictionary stored in with an entity
        and its number on each line.
        """
        self._dict.clear()
        for record in self._file_handle:
            # Exactly two space-separated fields are expected; anything
            # else makes the unpacking raise ValueError.
            flagged, number = record.split(" ")
            flag, payload = flagged[0], flagged[1:]
            entity = unquote(payload) if flag == "1" else payload
            self._dict[entity] = int(number)
        IndexStoreBase.read_dict_file(self)

    def open_out_file(self, use_dir, version_num):
        """Opens the output file for this class and prepares it
        to be written via write_entity.
        """
        # Let write_dict_file produce the file first, then reopen it for
        # appending so write_entity can add records.
        self.write_dict_file(use_dir, version_num)
        out_path = os.path.join(use_dir, self._name)
        # NOTE(review): the file is opened in binary append mode while
        # write_entity writes text -- confirm this against the Python
        # version the module targets.
        self._file_handle = open(out_path, 'ab',
                                 buffering=PKG_FILE_BUFSIZ)

    def write_entity(self, entity, my_id):
        """Writes the entity out to the file with my_id """
        assert self._file_handle is not None
        record = self.__quote(str(entity)) + " " + str(my_id) + "\n"
        self._file_handle.write(record)

    def write_dict_file(self, path, version_num):
        """Hand an empty list of lines to the parent's
        _protected_write_dict_file; the dictionary contents themselves
        are not passed along by this method.
        """
        IndexStoreBase._protected_write_dict_file(
            self, path, version_num, [])

    def count_entries_removed_during_partial_indexing(self):
        """Returns the number of entries removed during a second phase
        of indexing.  Always 0 for this class.
        """
        return 0
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen
class IndexStoreSetHash(IndexStoreBase):
    """Store holding a single SHA-1 hex digest computed over a
    collection of values, so that a stored index can be checked against
    the values it is expected to describe.
    """

    def __init__(self, file_name):
        """Create the store; hash_val starts as the digest of no data."""
        IndexStoreBase.__init__(self, file_name)
        # In order to interoperate with older clients, we must use sha-1
        # here.
        self.hash_val = hashlib.sha1().hexdigest()

    def set_hash(self, vals):
        """Set the hash value."""
        self.hash_val = self.calc_hash(vals)

    def calc_hash(self, vals):
        """Calculate the hash value of the sorted members of vals.

        Members are sorted before hashing so the result does not depend
        on iteration order.  Text members are encoded as UTF-8 before
        being hashed; members that already are bytes are hashed as-is.
        Returns the hexadecimal digest string.
        """
        # In order to interoperate with older clients, we must use sha-1
        # here.
        shasum = hashlib.sha1()
        for v in sorted(vals):
            # hashlib only accepts bytes-like input.  Encoding text here
            # keeps the digest of byte/ASCII input identical while making
            # text strings usable on Python 3.
            if not isinstance(v, bytes) and hasattr(v, "encode"):
                v = v.encode("utf-8")
            shasum.update(v)
        return shasum.hexdigest()

    def write_dict_file(self, path, version_num):
        """Write self.hash_val out to a line in a file """
        IndexStoreBase._protected_write_dict_file(
            self, path, version_num, [self.hash_val])

    def read_dict_file(self):
        """Process a dictionary file written using write_dict_file.

        Stores the stripped line in self.hash_val, restores the file
        position it started from, and returns the index of the last line
        read (0 for both an empty and a one-line file).
        """
        sp = self._file_handle.tell()
        res = 0
        for res, line in enumerate(self._file_handle):
            # The file is expected to hold at most one line.
            assert res < 1
            self.hash_val = line.rstrip()
        self._file_handle.seek(sp)
        IndexStoreBase.read_dict_file(self)
        return res

    def check_against_file(self, vals):
        """Check the hash value of vals against the value stored
        in the file for this object.

        Reads the file first if that has not happened yet.  Raises
        search_errors.IncorrectIndexFileHash when the digests differ.
        """
        if not self._have_read:
            self.read_dict_file()
        incoming_hash = self.calc_hash(vals)
        if self.hash_val != incoming_hash:
            raise search_errors.IncorrectIndexFileHash(
                self.hash_val, incoming_hash)

    def count_entries_removed_during_partial_indexing(self):
        """Returns the number of entries removed during a second phase
        of indexing.  Always 0 for this class."""
        return 0
7877db7b5daad125b6cb3e015574f33871c9a51bTimo Sirainen
class IndexStoreSet(IndexStoreBase):
    """Used when only set membership is desired.
    This is currently designed for exclusive use
    with storage of fmri.PkgFmris. However, that impact
    is only seen in the read_and_discard_matching_from_argument
    method.
    """

    def __init__(self, file_name):
        """Create an empty set store for file_name."""
        IndexStoreBase.__init__(self, file_name)
        self._set = set()

    def get_set(self):
        """Return the underlying set itself (not a copy)."""
        return self._set

    def clear(self):
        """Remove every member from the set."""
        self._set.clear()

    def add_entity(self, entity):
        """Add entity to the set."""
        self._set.add(entity)

    def remove_entity(self, entity):
        """Remove entity, deliberately assuming it is already in the
        set (KeyError otherwise).  This is useful for error checking
        and debugging.
        """
        self._set.remove(entity)

    def has_entity(self, entity):
        """Report whether entity is a member of the set."""
        return entity in self._set

    def write_dict_file(self, path, version_num):
        """Write each member of the set out to a line in a file """
        IndexStoreBase._protected_write_dict_file(
            self, path, version_num, self._set)

    def read_dict_file(self):
        """Process a dictionary file written using write_dict_file and
        return the number of lines read.
        """
        assert self._file_handle
        total = 0
        self._set.clear()
        for seen, raw in enumerate(self._file_handle, 1):
            member = raw.rstrip("\n")
            # Every earlier line must have added a new member, i.e. the
            # file may not contain duplicates.
            assert seen - 1 == len(self._set)
            self.add_entity(member)
            total = seen
        IndexStoreBase.read_dict_file(self)
        return total

    def read_and_discard_matching_from_argument(self, fmri_set):
        """Read the file and discard from fmri_set every fmri found in
        it.  Does nothing when no file handle is open.
        """
        if not self._file_handle:
            return
        for raw in self._file_handle:
            fmri_set.discard(fmri.PkgFmri(raw))

    def count_entries_removed_during_partial_indexing(self):
        """Returns the number of entries removed during a second phase
        of indexing.  For this class that is the size of the set."""
        return len(self._set)
class InvertedDict(IndexStoreBase):
    """Class used to store and process fmri to offset mappings.  It does
    delta compression and deduplication of shared offset sets when
    writing to a file."""

    def __init__(self, file_name, p_id_trans):
        """file_name is the name of the file to write to or read from.
        p_id_trans is an object which has a get_entity method which,
        when given a package id number returns the PkgFmri object
        for that id number."""
        IndexStoreBase.__init__(self, file_name)
        self._p_id_trans = p_id_trans
        # Filled by read_dict_file: fmri string -> delta-encoded offsets.
        self._dict = {}
        # Filled by add_pair: package id -> list of offsets.
        self._fmri_offsets = {}

    def __copy__(self):
        """Return a fresh instance with the same file name and
        translator; the stored mappings are not carried over."""
        return self.__class__(self._name, self._p_id_trans)

    def add_pair(self, p_id, offset):
        """Adds a package id number and an associated offset to the
        existing dictionary."""
        try:
            self._fmri_offsets[p_id].append(offset)
        except KeyError:
            self._fmri_offsets[p_id] = [offset]

    def invert_id_to_offsets_dict(self):
        """Does delta encoding of offsets to reduce space by only
        storing the difference between the current offset and the
        previous offset.  It also performs deduplication so that all
        packages with the same set of offsets share a common bucket.

        Returns a dictionary mapping each space separated delta string
        to the list of package ids sharing it.  The pairs collected by
        add_pair are consumed: each package id is removed from the
        internal mapping once it has been processed.
        """
        inv = {}
        # Iterate over a snapshot of the keys: entries are deleted as
        # they are processed, and a dictionary may not change size while
        # its live key view is being iterated.
        for p_id in list(self._fmri_offsets):
            old_o = 0
            bucket = []
            for o in sorted(set(self._fmri_offsets[p_id])):
                bucket.append(o - old_o)
                old_o = o
            h = " ".join([str(o) for o in bucket])
            del self._fmri_offsets[p_id]
            if h not in inv:
                inv[h] = []
            inv[h].append(p_id)
        return inv

    @staticmethod
    def __make_line(offset_str, p_ids, trans):
        """For a given offset string, a list of package id numbers,
        and a translator from package id numbers to PkgFmris, returns
        the string which represents that information.  Its format is
        space separated package fmris, followed by a !, followed by
        space separated offsets which have had delta compression
        performed."""
        return " ".join([
            trans.get_entity(p_id).get_fmri(anarchy=True,
                                            include_scheme=False)
            for p_id in p_ids
        ]) + "!" + offset_str

    def write_dict_file(self, path, version_num):
        """Write the mapping of package fmris to offset sets out
        to the file."""
        inv = self.invert_id_to_offsets_dict()
        IndexStoreBase._protected_write_dict_file(
            self, path, version_num, (
                self.__make_line(o, inv[o], self._p_id_trans)
                for o in inv
            ))

    def read_dict_file(self):
        """Read a file written by write_dict_file and store the
        information in a dictionary keyed by the fmri part of each
        line.  Entries already in the dictionary are not cleared."""
        assert self._file_handle
        for l in self._file_handle:
            # Exactly one "!" is expected per line; otherwise the
            # unpacking raises ValueError.
            fmris, offs = l.split("!")
            self._dict[fmris] = offs
        IndexStoreBase.read_dict_file(self)

    @staticmethod
    def de_delta(offs):
        """For a list of strings of offsets, undo the delta compression
        that has been performed and return the list of int offsets."""
        old_o = 0
        ret = []
        for o in offs:
            o = int(o) + old_o
            ret.append(o)
            old_o = o
        return ret

    def get_offsets(self, match_func):
        """For a given function which returns true if it matches the
        desired fmri, return the set of offsets which are associated
        with the fmris which match."""
        offs = []
        for fmris in self._dict:
            for p in fmris.split():
                if match_func(p):
                    offs.extend(self.de_delta(
                        self._dict[fmris].split()))
                    # One match is enough: every fmri on the line shares
                    # the same offsets.
                    break
        return set(offs)