search_storage.py revision 621
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
# TODO: add locks around the dictionary reading so that multiple
# threads do not load the dictionary at the same time.
import os
import errno
import time
import sha
import urllib
FMRI_FILE = 'id_to_fmri_dict.ascii'
ACTION_FILE = 'id_to_action_dict.ascii'
TT_FILE = 'id_to_token_type_dict.ascii'
VERSION_FILE = 'id_to_version_dict.ascii'
KEYVAL_FILE = 'id_to_keyval_dict.ascii'
FULL_FMRI_FILE = 'full_fmri_list'
MAIN_FILE = 'main_dict.ascii.v1'
BYTE_OFFSET_FILE = 'token_byte_offset.v1'
FULL_FMRI_HASH_FILE = 'full_fmri_list.hash'
"""Opens all data holders in data_list and ensures that the
versions are consistent among all of them.
It retries several times in case a race condition between file
migration and open is encountered.
Note: Do not set timeout to be 0. It will cause an exception to be
immediately raised.
"""
missing = None
cur_version = None
# The assignments to cur_version and missing cannot be
# placed here. They must be reset prior to breaking out of the
# for loop so that the while loop condition will be true. They
# cannot be placed after the for loop since that path is taken
# when all files are missing or opened successfully.
for d in data_list:
# All indexes must have the same version and all must
# either be present or absent for a successful return.
# If one of these conditions is not met, the function
# tries again until it succeeds or the time spent in
# the function is greater than timeout.
try:
# If we get here, then the current index file
# is present.
if missing == None:
elif missing:
missing = None
cur_version = None
break
d.set_file_handle(fh, f)
version_num = \
# Read the version. If this is the first file,
# set the expected version otherwise check that
# the version matches the expected version.
if cur_version == None:
elif not (cur_version == version_num):
# Got inconsistent versions, so close
# all files and try again.
for d in data_list:
missing = None
cur_version = None
break
except IOError, e:
# If the index file is missing, ensure
# that previous files were missing as
# well. If not, try again.
for d in data_list:
missing = None
cur_version = None
break
else:
for d in data_list:
raise
if missing:
assert cur_version == None
# The index is missing (i.e., no files were present).
return None
else:
assert cur_version is not None
return cur_version
class IndexStoreBase(object):
"""Base class for all data storage used by the indexer and
queryEngine. All members must have a file name and maintain
an internal file handle to that file as instructed by external
calls.
"""
self._file_handle = None
self._file_path = None
def get_file_name(self):
if self._file_handle:
raise RuntimeError("setting an extant file handle, "
"must close first, fp is: " + f_path)
else:
def get_file_path(self):
return self._file_path
def close_file_handle(self):
"""Closes the file handle and clears it so that it cannot
be reused.
"""
if self._file_handle:
self._file_handle = None
self._file_path = None
"""Writes the dictionary in the expected format.
Note: Only child classes should call this method.
"""
version_string = "VERSION: "
def should_reread(self):
"""This method uses the modification time and the file size
to (heuristically) determine whether the file backing this
storage has changed since it was last read.
"""
return True
return False
"""This uses consistent open to ensure that the version line
processing is done consistently and that only a single function
actually opens files stored using this class.
"""
class IndexStoreMainDict(IndexStoreBase):
"""Class for representing the main dictionary file
"""
# Here is an example of a line from the main dictionary, it is
# explained below:
# %gconf.xml (5,3,65689 => 249,202) (5,3,65690 => 249,202)
# (5,3,65691 => 249,202) (5,3,65692 => 249,202)
#
# The main dictionary has a more complicated format. Each line begins
# with a search token (%gconf.xml) followed by a list of mappings. Each
# mapping takes a token_type, action, and keyvalue tuple ((5,3,65689),
# (5,3,65690), (5,3,65691), (5,3,65692)) to a list of pkg-stem, version
# pairs (249,202) in which the token is found in an action with
# token_type, action, and keyvalues matching the tuple. Further
# compaction is gained by storing everything but the token as an id
# which the other dictionaries can turn into human-readable content.
#
# In short, the definition of a main dictionary entry is:
# Note: "(", ")", and "=>" actually appear in the file
# "[", "]", and "+" are used to specify pattern
# token [(token_type_id, action_id, keyval_id => [pkg_stem_id,version_id ]+)]+
self._old_suffix = None
"""This class relies on external methods to write the file.
Making this empty call to protected_write_dict_file allows the
file to be set up correctly with the version number stored
correctly.
"""
version_num, [])
def get_file_handle(self):
"""Return the file handle. Note that doing
anything other than sequential reads or writes
to or from this file_handle may result in unexpected
behavior. In short, don't use seek.
"""
return self._file_handle
def parse_main_dict_line(line):
"""Parses one line of a main dictionary file.
Changes to this function must be paired with changes to
write_main_dict_line below.
"""
assert tok_end > 0
res = []
processed_fmris = []
"""Paired with parse_main_dict_line above. Writes
a line in a main dictionary file in the appropriate format.
"""
for k in dictionary.keys():
"""Returns the number of entries removed during a second phase
of indexing.
"""
# This returns 0 because this class is not responsible for
# storing anything in memory.
return 0
"""Moves the existing file with self._name in directory
use_dir to a new file named self._name + suffix in directory
use_dir. If it has done this previously, it removes the old
file it moved. It also opens the newly moved file and uses
that as the file for its file handle.
"""
assert self._file_handle is None
if self._old_suffix is not None:
class IndexStoreListDict(IndexStoreBase):
"""Used when both a list and a dictionary are needed to
store the information. Used for bidirectional lookup when
one item is an int (an id) and the other is not (an entity). It
maintains a list of empty spots in the list so that adding entities
can take advantage of unused space. It encodes empty space as a blank
line in the file format and '' in the internal list.
"""
self._list_of_empties = []
"""Adds an entity consistently to the list and dictionary
allowing bidirectional lookup.
"""
else:
else:
if not(is_empty):
return use_id
"""deletes in_id from the list and the dictionary """
"""deletes the entity from the list and the dictionary """
"""returns the id of entity """
"""Adds entity if it's not previously stored and returns the
id for entity.
"""
# This code purposefully reimplements add_entity
# code. Replacing the function calls to has_entity, add_entity,
# and get_id with direct access to the data structure gave a
# speed up of a factor of 4. Because this is a very hot path,
# the tradeoff seemed appropriate.
if self._list_of_empties:
else:
else:
"""return the entity in_id maps to """
"""check if entity is in storage """
"""Check if the structure has any empty elements which
can be filled with data.
"""
def get_next_empty(self):
"""returns the next id which maps to no element """
"""Passes self._list to the parent class to write to a file.
"""
def read_dict_file(self):
"""Reads in a dictionary previously stored using the above
call
"""
assert self._file_handle
if self.should_reread():
# A blank line means that id can be reused.
if line == '\n':
else:
if self._build_func:
"""Returns the number of entries removed during a second phase
of indexing.
"""
class IndexStoreDict(IndexStoreBase):
"""Class used when only entity -> id lookup is needed
"""
def read_dict_file(self):
"""Reads in a dictionary stored in line number -> entity
format
"""
if self.should_reread():
"""If it's necessary to reread the file, it rereads the
file. It matches the line it reads against the contents of
in_set. If a match is found, the entry on the line is stored
for later use, otherwise the line is skipped. When all items
in in_set have been matched, the method is done and returns.
"""
if self.should_reread():
match_cnt = 0
if i in in_set:
match_cnt += 1
break
"""Returns the number of entries removed during a second phase
of indexing.
"""
class IndexStoreDictMutable(IndexStoreBase):
"""Dictionary which allows dynamic update of its storage
"""
else:
return str[1:]
if " " in str:
else:
return "0" + str
def read_dict_file(self):
"""Reads in a dictionary stored with an entity
and its number on each line.
"""
if self.should_reread():
"""Opens the output file for this class and prepares it
to be written via write_entity.
"""
'ab')
"""Writes the entity out to the file with my_id """
assert self._file_handle is not None
""" Generates an iterable list of string representations of
the dictionary that the parent's protected_write_dict_file
function can call.
"""
version_num, [])
"""Returns the number of entries removed during a second phase
of indexing.
"""
return 0
class IndexStoreSetHash(IndexStoreBase):
"""Set the hash value."""
"""Calculate the hash value of the sorted members of vals."""
for v in vl:
"""Write self.hash_val out to a line in a file """
def read_dict_file(self):
"""Process a dictionary file written using the above method
"""
assert self._file_handle
res = 0
assert res < 1
if res > 0:
return res
"""Check the hash value of vals against the value stored
in the file for this object."""
"""Returns the number of entries removed during a second phase
of indexing."""
return 0
class IndexStoreSet(IndexStoreBase):
"""Used when only set membership is desired.
This is currently designed for exclusive use
with storage of fmri.PkgFmris. However, that impact
is only seen in the read_and_discard_matching_from_argument
method.
"""
"""Remove entity purposefully assumes that entity is
already in the set to be removed. This is useful for
error checking and debugging.
"""
"""Write each member of the set out to a line in a file """
def read_dict_file(self):
"""Process a dictionary file written using the above method
"""
assert self._file_handle
res = 0
if self.should_reread():
res = i + 1
return res
"""Reads the file and removes all fmris in the file
from fmri_set.
"""
if self._file_handle:
"""Returns the number of entries removed during a second phase
of indexing."""