#!/usr/bin/python
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2009, 2016, Oracle and/or its affiliates. All rights reserved.
#
from __future__ import print_function
import os
import fnmatch
import re
import six
import sys
import threading
import copy
import itertools
import errno
import ply.lex as lex
import ply.yacc as yacc
import pkg.search_storage as ss
import pkg.search_errors as search_errors
import pkg.fmri as fmri
import pkg.actions as actions
from pkg.choose import choose
from pkg.misc import EmptyI, force_str
FILE_OPEN_TIMEOUT_SECS = 1
MAX_TOKEN_COUNT = 100
class QueryLexer(object):
"""This class defines the lexer used to separate parse queries into
its constituents. It's written for Ply, a python implementation for
lex and yacc."""
# These are the types of tokens that the lexer can produce.
tokens = ("FTERM", "TERM", "LPAREN", "RPAREN", "AND", "OR", "QUOTE1",
"QUOTE2", "LBRACE", "RBRACE")
# These statements define the lexing rules for (, ), ', and ". These
# are all checked after the lexing rules defined in functions below.
t_LPAREN = r"\("
t_RPAREN = r"\)"
t_QUOTE1 = r"[\']"
t_QUOTE2 = r'[\"]'
# This rule causes spaces to break tokens, but the space itself is not
# reported as a token.
t_ignore = r" "
def __init__(self):
object.__init__(self)
# Set up a dictionary of strings which have special meaning and
# are otherwise indistinguishable from normal terms. The
# mapping is from the string seen in the query to the lexer
# tokens.
self.reserved = {
"AND" : "AND",
"OR" : "OR"
}
# Note: Functions are documented using comments instead of docstrings
# below because Ply uses the docstrings for specific purposes.
def t_LBRACE(self, t):
# This rule is for lexing the left side of the pkgs<>
# construction.
r"([pP]([kK]([gG][sS]?)?)?)?<"
t.type = "LBRACE"
return t
def t_RBRACE(self, t):
# This rule is for lexing the right side of the pkgs<>
# construction.
r">"
t.type = "RBRACE"
return t
def t_FTERM(self, t):
# This rule handles valid search terms with a colon in them. If
# all colons are escaped, it produces a TERM token whose value
# is the original term with the escape characters removed. If
# there are unescaped colons, it produces an FTERM token whose
# value is a four tuple consisting of the pkg name, the action
# type, the action key, and the token that followed the last
# unescaped colon.
#
# The following regular expression matches a string with a colon
# in it, subject to certain other restrictions, such as not
# beginning with a quote. It consists of three parts: the
# part before the colon, the colon, and the part after the colon.
# The part before the colon is attempting to match anything that
# could come before a colon acting as a field delimiter or
# the escaped colon in a token. The colon is matching either a
# colon acting as a field separator or an escaped colon in a
# token or field. The part after the colon is attempting to
# match the valid term that can follow a colon that's either
# a field separator or escaped as part of a term.
r"([^\'\"\(\s][^\s]*)?\:([^\s\(\'\"][^\s]*[^\s\)\'\"\>]|[^\s\(\)\'\"\>])?"
fields = t.value.split(":")
assert len(fields) >= 2
tmp = fields[0:1]
for field in fields[1:]:
if tmp[-1] and tmp[-1][-1] == "\\":
tmp[-1] = tmp[-1][:-1] + ":" + field
else:
tmp.append(field)
fields = tmp
token = fields[-1]
# If only one field remains after rejoining the escaped colons,
# then there was no field query and all of the colons were
# escaped. In that case, treat the item as a TERM rather than
# an FTERM.
if len(fields) == 1:
t.type = self.reserved.get(token, "TERM")
t.value = token
# If an unescaped colon was in the term, then this was actually
# an FTERM, so fill in the fields and set the type to FTERM.
else:
key = fields[-2]
action_type = ""
pkg_name = ""
if len(fields) >= 3:
action_type = fields[-3]
if len(fields) >= 4:
pkg_name = fields[-4]
t.type = "FTERM"
t.value = (pkg_name, action_type, key, token)
return t
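# Illustrative examples of how t_FTERM decomposes its input (token
# positions elided; not verbatim lexer output):
#
#     'foo:bar:baz:zap' -> FTERM ("foo", "bar", "baz", "zap")
#                          (pkg name, action type, key, token)
#     'foo:baz:'        -> FTERM ("", "foo", "baz", "")
#     'a\:b'            -> TERM "a:b" (all colons escaped)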
def t_TERM(self, t):
# This rule handles the general search terms as well as
# checking for any reserved words such as AND or OR.
r'[^\s\(\'\"][^\s]*[^\s\)\'\"\>]|[^\s\(\)\'\"]'
t.type = self.reserved.get(t.value, "TERM")
return t
def t_error(self, t):
raise RuntimeError("\n".join(
[_("An unparseable character in query at position : {0:d}").format(
self.get_pos() + 1),
"{0}".format(self.get_string()),
"{0}".format(" " * self.get_pos() + "^")]))
def build(self, **kwargs):
self.lexer = lex.lex(object=self, **kwargs)
def set_input(self, input):
self.lexer.input(input)
def token(self):
return self.lexer.token()
def get_pos(self):
return self.lexer.lexpos
def get_string(self):
return self.lexer.lexdata
def test(self, data):
"""This is a function useful for testing and debugging as it
shows the user exactly which tokens are produced from the input
data."""
self.lexer.input(data)
while True:
tok = self.lexer.token()
if not tok:
break
print(tok, file=sys.stderr)
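# A minimal usage sketch for the lexer (assumes Ply is installed; the
# LexToken reprs below are representative, not verbatim output):
#
#     ql = QueryLexer()
#     ql.build()
#     ql.test("foo AND pkgs<bar>")
#     # LexToken(TERM,'foo',...)  LexToken(AND,'AND',...)
#     # LexToken(LBRACE,'pkgs<',...)  LexToken(TERM,'bar',...)
#     # LexToken(RBRACE,'>',...)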
class QueryParser(object):
"""This class defines the parser which converts a stream of tokens into
an abstract syntax tree representation of the query. The AST is able
to perform the search."""
# Use the same set of tokens as the lexer.
tokens = QueryLexer.tokens
# Define the precedence and associativity of certain tokens to
# eliminate shift/reduce conflicts in the grammar.
precedence = (
("right", "FTERM"),
("left", "TERM"),
("right", "AND", "OR"),
("right", "QUOTE1", "QUOTE2"),
)
# Note: Like the lexer, Ply uses the docstrings of the functions to
# determine the rules.
def p_top_level(self, p):
# This is the top or start node of the AST.
'''start : xterm'''
p[0] = self.query_objs["TopQuery"](p[1])
def p_xterm(self, p):
# This rule parses xterms. xterms are terms which can connect
# smaller terms together.
''' xterm : basetermlist
| andterm
| orterm'''
p[0] = p[1]
def p_basetermlist(self, p):
# basetermlist performs the implicit AND operation that is placed
# between a list of terms. For example, the query 'foo bar' is
# treated the same as 'foo AND bar'.
''' basetermlist : baseterm
| baseterm basetermlist '''
if len(p) == 3:
p[0] = self.query_objs["AndQuery"](p[1], p[2])
else:
p[0] = p[1]
def p_baseterm(self, p):
# baseterms are the minimal units of meaning for the parser.
# Any baseterm is a valid query unto itself.
''' baseterm : term
| fterm
| phraseterm
| parenterm
| pkgconv'''
p[0] = p[1]
def p_term(self, p):
# terms are the parser's representation of the lexer's TERMs.
# The TermQuery object performs most of the work of actually
# performing the search.
'term : TERM'
p[0] = self.query_objs["TermQuery"](p[1])
def p_fterm(self, p):
# fterms are the parser's representation of the lexer's FTERMs
# (which are field/structured query terms). In the query
# 'foo:bar:baz:zap', foo, bar, and baz become the fields of the
# FTERM and zap becomes its token. In 'foo:bar:baz: zap', the
# FTERM carries an empty token and zap is lexed as a separate
# TERM, flowing up from parsing the ftermarg. A query like
# 'foo:bar:baz:' has no ftermarg at all. In that case, an
# implicit wildcard is explicitly created.
'''fterm : FTERM ftermarg
| FTERM fterm
| FTERM'''
# If the len of p is 3, then one of the first two cases
# was used.
pkg_name, at, key, token = p[1]
fields = pkg_name, at, key
if len(p) == 3:
# If no token was attached to the FTERM, then attach
# the term found following it. If a token was attached
# to the FTERM, then the following term is treated like
# a basetermlist.
if token == "":
p[0] = self.query_objs["FieldQuery"](
fields, p[2])
else:
p[0] = self.query_objs["AndQuery"](
self.query_objs["FieldQuery"](
(pkg_name, at, key),
self.query_objs["TermQuery"](token)),
p[2])
# If the length of p isn't 3, then a bare FTERM was found. If
# no token was attached to the FTERM, it's necessary to make
# the implicit wildcard explicit.
else:
if token == "":
token = "*"
p[0] = self.query_objs["FieldQuery"](fields,
self.query_objs["TermQuery"](token))
def p_ftermarg(self, p):
# ftermargs are the terms which are valid after the final
# colon of a field term.
'''ftermarg : term
| phraseterm'''
p[0] = p[1]
def p_phraseterm(self, p):
# phraseterms are lists of terms enclosed by quotes.
''' phraseterm : QUOTE1 term_list QUOTE1
| QUOTE2 term_list QUOTE2'''
p[2].reverse()
p[0] = self.query_objs["PhraseQuery"](p[2],
self.query_objs["TermQuery"])
def p_term_list(self, p):
# term_lists consist of one or more space separated TERMs.
''' term_list : TERM term_list
| TERM '''
if len(p) == 3:
p[2].append(p[1])
p[0] = p[2]
else:
p[0] = [p[1]]
def p_parenterm(self, p):
# parenterms contain a single xterm surrounded by parens.
# The p[2] argument is simply passed on because the only
# role of parens is to perform grouping, which is enforced
# by the structure of the AST.
''' parenterm : LPAREN xterm RPAREN '''
p[0] = p[2]
def p_packages(self, p):
# pkgconv represents the pkgs<> term.
'pkgconv : LBRACE xterm RBRACE'
p[0] = self.query_objs["PkgConversion"](p[2])
def p_andterm(self, p):
# andterms perform an intersection of the results of their
# two children.
''' andterm : xterm AND xterm'''
p[0] = self.query_objs["AndQuery"](p[1], p[3])
def p_orterm(self, p):
''' orterm : xterm OR xterm'''
# orterms return the union of the results of their
# two children.
p[0] = self.query_objs["OrQuery"](p[1], p[3])
def p_error(self, p):
raise ParseError(p, self.lexer.get_pos(),
self.lexer.get_string())
def __init__(self, lexer):
"""Build a parser using the lexer given as an argument."""
self.lexer = lexer
self.parser = yacc.yacc(module=self, debug=0, write_tables=0)
# Store the classes used to build the AST so that child classes
# can replace them where needed with alternate classes.
self.query_objs = {
"AndQuery" : AndQuery,
"FieldQuery" : FieldQuery,
"OrQuery" : OrQuery,
"PhraseQuery" : PhraseQuery,
"PkgConversion" : PkgConversion,
"TermQuery" : TermQuery,
"TopQuery" : TopQuery
}
def parse(self, input):
"""Parse the string, input, into an AST using the rules defined
in this class."""
self.lexer.set_input(input)
return self.parser.parse(lexer=self.lexer)
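# A minimal end-to-end sketch (assumes Ply is available; in practice
# the client and server subclass QueryParser and override entries in
# query_objs rather than using this module's AST classes directly):
#
#     ql = QueryLexer()
#     ql.build()
#     qp = QueryParser(ql)
#     ast = qp.parse("foo AND bar")
#     # ast is a TopQuery wrapping an AndQuery of two TermQuery nodes.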
class QueryException(Exception):
pass
class QueryLengthExceeded(QueryException):
def __init__(self, token_cnt):
QueryException.__init__(self)
self.token_cnt = token_cnt
def __str__(self):
return _("The number of terms in the query is {len:d}, "
"which exceeds the maximum supported "
"value of {maxt:d} terms.").format(len=self.token_cnt,
maxt=MAX_TOKEN_COUNT)
class DetailedValueError(QueryException):
def __init__(self, name, bad_value, whole_query):
QueryException.__init__(self)
self.name = name
self.bad_value = bad_value
self.query = whole_query
def __str__(self):
return _("In query {query}, {name} had a bad value of "
"'{bv}'.").format(
query=self.query,
name=self.name,
bv=self.bad_value
)
class IncompleteQuery(QueryException):
def __init__(self, whole_query):
QueryException.__init__(self)
self.query = whole_query
def __str__(self):
return _("A query is expected to have five fields: "
"case sensitivity, return type, number of results to "
"return, the number at which to start returning results, "
"and the text of the query. The query provided lacked at "
"least one of those fields:\n{0}").format(self.query)
class ParseError(QueryException):
def __init__(self, parse_object, string_position, input_string):
QueryException.__init__(self)
self.p = parse_object
self.pos = string_position
self.str = input_string
def __str__(self):
# BUI will interpret a line starting with a \t as pre-formatted
# and put it in <pre> tags.
return "\n".join([_("Could not parse query."),
_("Problem occurred with: {0}\t").format(self.p),
"\t{0}".format(self.str),
"\t{0}".format(" " * max(self.pos - 1, 0) + "^")])
class Query(object):
"""General Query object. It defines various constants and provides for
marshalling a Query into and out of a string format."""
RETURN_EITHER = 0
RETURN_PACKAGES = 1
RETURN_ACTIONS = 2
VALIDATION_STRING = { 1:'Return from search v1\n' }
def __init__(self, text, case_sensitive, return_type, num_to_return,
start_point):
"""Construct a query object.
The "text" parameter is the tokens and syntax of the query.
The "case_sensitive" parameter is a boolean which determines
whether the query is case sensitive or not.
The "return_type" parameter must be either RETURN_PACKAGES or
RETURN_ACTIONS and determines whether the query is expected to
return packages or actions to the querier.
The "num_to_return" paramter is the maximum number of results to
return.
The "start_point" parameter is the number of results to skip
before returning results to the querier."""
token_cnt = len(text.split(" "))
if token_cnt > MAX_TOKEN_COUNT:
raise QueryLengthExceeded(token_cnt)
self.text = text
self.case_sensitive = case_sensitive
self.return_type = return_type
assert self.return_type == Query.RETURN_PACKAGES or \
self.return_type == Query.RETURN_ACTIONS
self.num_to_return = num_to_return
self.start_point = start_point
def __str__(self):
"""Return the v1 string representation of this query."""
return "{0}_{1}_{2}_{3}_{4}".format(self.case_sensitive,
self.return_type, self.num_to_return, self.start_point,
self.text)
@staticmethod
def fromstr(s):
"""Take the output of the __str__ method of this class and
return a Query object from that string."""
try:
case_sensitive, return_type, num_to_return, \
start_point, text = s.split("_", 4)
except ValueError:
raise IncompleteQuery(s)
if case_sensitive == 'True':
case_sensitive = True
elif case_sensitive == 'False':
case_sensitive = False
else:
raise DetailedValueError("case_sensitive",
case_sensitive, s)
if num_to_return == 'None':
num_to_return = None
else:
try:
num_to_return = int(num_to_return)
except ValueError:
raise DetailedValueError("num_to_return",
num_to_return, s)
if start_point == 'None':
start_point = None
else:
try:
start_point = int(start_point)
except ValueError:
raise DetailedValueError("start_point",
start_point, s)
try:
return_type = int(return_type)
except ValueError:
raise DetailedValueError("return_type", return_type, s)
if return_type != Query.RETURN_PACKAGES and \
return_type != Query.RETURN_ACTIONS:
raise DetailedValueError("return_type", return_type, s)
return Query(text, case_sensitive, return_type,
num_to_return, start_point)
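# Example of the v1 marshalled form (the underscore-separated fields
# are case sensitivity, return type, number to return, start point,
# and the query text; fromstr splits on at most four underscores so
# that underscores in the text itself survive the round trip):
#
#     str(Query("foo bar", False, Query.RETURN_PACKAGES, 10, None))
#         -> "False_1_10_None_foo bar"
#     Query.fromstr("False_1_10_None_foo bar").text
#         -> "foo bar"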
@staticmethod
def return_action_to_key(k):
"""Method which produces the sort key for an action."""
at, st, pfmri, fv, l = k
return pfmri
class BooleanQueryException(QueryException):
"""This exception is used when the two children of a boolean query
don't agree on whether to return actions or packages."""
def __init__(self, ac, pc):
"""The parameter "ac" is the child which returned actions
while "pc" is the child which returned packages."""
QueryException.__init__(self)
self.ac = ac
self.pc = pc
def __str__(self):
# BUI will interpret a line starting with a \t as pre-formatted
# and put it in <pre> tags.
ac_s = _("This expression produces action results:")
ac_q = "\t{0}".format(self.ac)
pc_s = _("This expression produces package results:")
pc_q = "\t{0}".format(self.pc)
return "\n".join([ac_s, ac_q, pc_s, pc_q,
_("'AND' and 'OR' require those expressions to produce "
"the same type of results.")])
class BooleanQuery(object):
"""Superclass for all boolean operations in the AST."""
def __init__(self, left_query, right_query):
"""The parameters "left_query" and "right_query" are objects
which implement the query interface. Specifically, they're
expected to implement search, allow_version, set_info, and to
have a public member called return_type."""
self.lc = left_query
self.rc = right_query
self.return_type = self.lc.return_type
self.__check_return_types()
def __check_return_types(self):
if self.lc.return_type != self.rc.return_type:
if self.lc.return_type == Query.RETURN_ACTIONS:
raise BooleanQueryException(self.lc, self.rc)
else:
raise BooleanQueryException(self.rc, self.lc)
def add_field_restrictions(self, *params):
self.lc.add_field_restrictions(*params)
self.rc.add_field_restrictions(*params)
def set_info(self, **kwargs):
"""This function passes information to the terms prior to
search being executed. For a boolean query, it only needs to
pass whatever information exists onto its children."""
self.lc.set_info(**kwargs)
self.rc.set_info(**kwargs)
def search(self, *args):
"""Distributes the search to the two children and returns a
tuple of the results."""
return set(self.lc.search(None, *args)), \
set(self.rc.search(None, *args))
def sorted(self, res):
"""Sort the results. If the results are actions, sort by the
fmris of the packages from which they came."""
key = None
if self.return_type == Query.RETURN_ACTIONS:
key = Query.return_action_to_key
return sorted(res, key=key)
def allow_version(self, v):
"""Returns whether the query supports version v."""
return v > 0 and self.lc.allow_version(v) and \
self.rc.allow_version(v)
def propagate_pkg_return(self):
"""Makes each child return packages instead of actions.
If a child returns a value that isn't None, that means a new
node in the tree has been created which needs to become the
new child for this node."""
self.return_type = Query.RETURN_PACKAGES
new_lc = self.lc.propagate_pkg_return()
if new_lc:
self.lc = new_lc
new_rc = self.rc.propagate_pkg_return()
if new_rc:
self.rc = new_rc
self.__check_return_types()
return None
class AndQuery(BooleanQuery):
"""Class representing AND queries in the AST."""
def search(self, restriction, *args):
"""Performs a search over the two children and combines
the results.
The "restriction" parameter is a generator of actions, over
which the search shall be performed. Only boolean queries will
set restriction. Nodes that return packages have, by
definition, parents that must return packages. This means that
only queries contained within a boolean query higher up in the
AST will have restriction set."""
if self.return_type == Query.RETURN_ACTIONS:
# If actions are being returned, the answers from
# previous terms must be used as the domain of search.
# To do this, restriction is passed to the left
# child and the result from that child is passed to
# the right child as its domain.
lc_it = self.lc.search(restriction, *args)
return self.rc.search(lc_it, *args)
else:
# If packages are being returned, holding the names
# of all known packages in memory is feasible. By
# using sets, and their intersection, duplicates are
# also removed from the results.
lc_set, rc_set = BooleanQuery.search(self, *args)
return self.sorted(lc_set & rc_set)
def __str__(self):
return "({0!s} AND {1!s})".format(self.lc, self.rc)
def __repr__(self):
return "({0!r} AND {1!r})".format(self.lc, self.rc)
class OrQuery(BooleanQuery):
"""Class representing OR queries in the AST."""
def search(self, restriction, *args):
"""Performs a search over the two children and combines
the results.
The "restriction" parameter is a generator function that returns
actions within the search domain. If it's not None,
then this query is under a higher boolean query which also
returns actions."""
if self.return_type == Query.RETURN_PACKAGES:
# If packages are being returned, it's feasible to
# hold them all in memory. This allows the use of
# sets, and their union, to remove duplicates and
# produce a sorted list.
lc_set, rc_set = BooleanQuery.search(self, *args)
for i in self.sorted(lc_set | rc_set):
yield i
elif not restriction:
# If restriction doesn't exist, then chain together
# the results from both children.
for i in itertools.chain(self.lc.search(restriction,
*args), self.rc.search(restriction, *args)):
yield i
else:
# If restriction exists, then it must serve as the
# domain for the children. It is a generator so
# only one pass may be made over it. Also, it is not
# possible, in general, to hold a list of all items
# that the generator will produce in memory. These
# reasons lead to the construction below, which iterates
# over the results in restriction and uses each as the
# restriction to the child search. If this turns out
# to be a performance bottleneck, it would be possible
# to gather N of the results in restriction into a list
# and dispatch them to the children results in one shot.
# The tradeoff is the memory to hold O(N) results in
# memory at each level of ORs in the AST.
for i in restriction:
for j in itertools.chain(self.lc.search([i],
*args), self.rc.search([i], *args)):
yield j
def __str__(self):
return "({0!s} OR {1!s})".format(self.lc, self.rc)
def __repr__(self):
return "({0!r} OR {1!r})".format(self.lc, self.rc)
class PkgConversion(object):
"""Class representing a change from returning actions to returning
packages in the AST."""
def __init__(self, query):
self.query = query
self.return_type = Query.RETURN_PACKAGES
def __str__(self):
return "p<{0!s}>".format(self.query)
def __repr__(self):
return "p<{0!r}>".format(self.query)
def set_info(self, **kwargs):
"""This function passes information to the terms prior to
search being executed. It only needs to pass whatever
information exists to its child."""
self.query.set_info(**kwargs)
@staticmethod
def optional_action_to_package(it, return_type, current_type):
"""Based on the return_type and current type, it converts the
iterator over results, it, to a sorted list of packages.
return_type is what the caller wants to return and current_type
is what it is iterating over."""
if current_type == return_type or return_type == \
Query.RETURN_EITHER:
return it
elif return_type == Query.RETURN_PACKAGES and \
current_type == Query.RETURN_ACTIONS:
return sorted(set(
(pfmri for at, st, pfmri, fv, l in it)))
else:
assert 0
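# Illustrative example: converting RETURN_ACTIONS results such as
#     ("file", "path", "pkg:/foo@1.0", "usr/bin/ls", <line>)
# to RETURN_PACKAGES keeps only the sorted, deduplicated pfmri
# values, here just "pkg:/foo@1.0".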
def search(self, restriction, *args):
"""Takes the results of its child's search and converts the
results to be a sorted list of packages.
The "restriction" parameter is a generator of actions which
the domain over which search should be performed. It should
always be None."""
return self.optional_action_to_package(
self.query.search(restriction, *args),
Query.RETURN_PACKAGES, self.query.return_type)
def allow_version(self, v):
"""Returns whether the query supports a query of version v."""
return v > 0 and self.query.allow_version(v)
def propagate_pkg_return(self):
"""Makes this node return packages instead of actions.
Returns None because no changes need to be made to the tree."""
return None
class PhraseQuery(object):
"""Class representing a phrase search in the AST"""
def __init__(self, str_list, term_query_class):
"""The "str_list" parameter is the list of strings which make
up the phrase to be searched.
The "term_query_class" parameter is a TermQuery object which
handles searching for the initial word in the phrase."""
assert str_list
self.query = term_query_class(str_list[0])
self.full_str = " ".join(str_list)
self.compare_str = self.full_str
self.return_type = Query.RETURN_ACTIONS
if len(str_list) > 1:
self.query.add_trailing_wildcard()
self._case_sensitive = None
@property
def pkg_name(self):
return self.query.pkg_name
@property
def action_type(self):
return self.query.action_type
@property
def key(self):
return self.query.key
def __repr__(self):
return "Phrase Query:'" + self.full_str + "'"
def __str__(self):
return "{0}:'{1}'".format(self.query.field_strings(),
self.full_str)
def add_field_restrictions(self, *params):
self.query.add_field_restrictions(*params)
def set_info(self, case_sensitive, **kwargs):
"""This function passes information to the terms prior to
search being executed. It only needs to pass whatever
information exists to its child."""
self._case_sensitive = case_sensitive
if not case_sensitive:
self.compare_str = self.full_str.lower()
self.query.set_info(case_sensitive=case_sensitive, **kwargs)
def filter_res(self, l):
"""Check to see if the phrase is contained in l, the string of
the original action."""
if self._case_sensitive:
return self.compare_str in l
return self.compare_str in l.lower()
@staticmethod
def combine(fs, fv, at, case_sensitive):
"""Checks to see if the phrase being searched for is a subtring
of the value which matched the token. If it is, use the value
returned, otherwise use the search phrase."""
if at != "set" or fs in fv or \
(not case_sensitive and fs in fv.lower()):
return fv
else:
return fs
def search(self, restriction, *args):
"""Perform a search for the given phrase. The child is used to
find instances of the first word of the phrase. Those results
are then filtered by searching for the longer phrase within
the original line of the action. restriction is a generator
function that returns actions within the search domain."""
it = (
(at, st, pfmri, self.combine(self.compare_str, fv, at,
self._case_sensitive), force_str(l))
for at, st, pfmri, fv, l
in self.query.search(restriction, *args)
if self.filter_res(force_str(l))
)
return it
def allow_version(self, v):
"""Returns whether the query supports a query of version v."""
return v > 0 and self.query.allow_version(v)
def propagate_pkg_return(self):
"""Inserts a conversion to package results into the tree.
Creates a new node by wrapping a PkgConversion node around
itself. It then returns the new node to its parent for
insertion into the tree."""
return PkgConversion(self)
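# An illustrative sketch of phrase search: PhraseQuery(["foo", "bar"],
# TermQuery) builds a child TermQuery for "foo" and, because the
# phrase has more than one word, widens it to the prefix "foo*";
# filter_res then keeps only those results whose original action line
# contains the full phrase "foo bar" (compared case-insensitively
# unless case_sensitive was set via set_info).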
class FieldQuery(object):
"""Class representing a structured query in the AST."""
def __init__(self, params, query):
"""Builds a FieldQuery object.
The "params" parameter are the three parts of the structured
search term pulled apart during parsing.
The "query" parameter is the Query object which contains the
fourth field (the token) of the structured search."""
# For efficiency, especially on queries which search over
# '*', instead of filtering the results, this class makes
# modifications to its child query so that it will do the
# needed filtering as it performs the search.
self.query = query
self.return_type = Query.RETURN_ACTIONS
self.query.add_field_restrictions(*params)
def __repr__(self):
return "( PN:{0!r} AT:{1!r} ST:{2!r} Q:{3!r})".format(
self.query.pkg_name, self.query.action_type, self.query.key,
self.query)
def __str__(self):
return str(self.query)
def set_info(self, **kwargs):
"""This function passes information to the terms prior to
search being executed. It only needs to pass whatever
information exists to its child."""
self.query.set_info(**kwargs)
def search(self, restriction, *args):
"""Perform a search for the structured query. The child has
been modified so that it is able to do the structured query
directly."""
assert self.query.return_type == Query.RETURN_ACTIONS
return self.query.search(restriction, *args)
def allow_version(self, v):
"""Returns whether the query supports a query of version v."""
return v > 0 and self.query.allow_version(v)
def propagate_pkg_return(self):
"""Inserts a conversion to package results into the tree.
Creates a new node by wrapping a PkgConversion node around
itself. It then returns the new node to its parent for
insertion into the tree."""
return PkgConversion(self)
class TopQuery(object):
"""Class which must be at the top of all valid ASTs, and may only be
at the top of an AST. It handles starting N results in, or only
showing M items. It also transforms the internal representations of
results into the format expected by the callers of search."""
def __init__(self, query):
self.query = query
self.start_point = 0
self.num_to_return = None
def __repr__(self):
return "TopQuery({0!r})".format(self.query)
def __str__(self):
return str(self.query)
def __keep(self, x):
"""Determines whether the x'th result should be returned."""
return x >= self.start_point and \
(self.num_to_return is None or
x < self.num_to_return + self.start_point)
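# For example, with start_point=10 and num_to_return=5, __keep
# accepts only the results numbered 10 through 14 inclusive.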
def finalize_results(self, it):
"""Converts the internal result representation to the format
which is expected by the callers of search. It also handles
returning only those results requested by the user."""
# Need to replace "1" with current search version, or something
# similar
if self.query.return_type == Query.RETURN_ACTIONS:
return (
(1, Query.RETURN_ACTIONS,
(fmri.PkgFmri(pfmri), fv, force_str(l)))
for x, (at, st, pfmri, fv, l)
in enumerate(it)
if self.__keep(x)
)
else:
return (
(1, Query.RETURN_PACKAGES, fmri.PkgFmri(pfmri))
for x, pfmri
in enumerate(it)
if self.__keep(x)
)
def set_info(self, num_to_return, start_point, **kwargs):
"""This function passes information to the terms prior to
search being executed. This is also where the starting point
and number of results to return is set. Both "num_to_return"
and "start_point" are expected to be integers."""
if start_point:
self.start_point = start_point
self.num_to_return = num_to_return
self.query.set_info(start_point=start_point,
num_to_return=num_to_return, **kwargs)
def search(self, *args):
"""Perform search by taking the result of the child's search
and transforming and subselecting the results. None is passed
to the child since initially there is no set of results to
restrict subsequent searches to."""
return self.finalize_results(self.query.search(None, *args))
def allow_version(self, v):
"""Returns whether the query supports a query of version v."""
return self.query.allow_version(v)
def propagate_pkg_return(self):
"""Makes the child return packages instead of actions.
If a child returns a value that isn't None, that means a new
node in the tree has been created which needs to become the
new child for this node."""
new_child = self.query.propagate_pkg_return()
if new_child:
self.query = new_child
return None
class TermQuery(object):
"""Class representing the a single query term in the AST. This is an
abstract class and should not be used instead of the related client and
server classes."""
# This structure was used to gather all index files into one
# location. If a new index structure is needed, the files can
# be added to (or removed from) here. Providing a list or
# dictionary allows an easy approach to opening or closing all
# index files.
__dict_locks = {}
has_non_wildcard_character = re.compile(r'.*[^\*\?].*')
fmris = None
def __init__(self, term):
"""term is a the string for the token to be searched for."""
term = term.strip()
self._glob = False
if '*' in term or '?' in term or '[' in term:
self._glob = True
self._term = term
self.return_type = Query.RETURN_ACTIONS
self._file_timeout_secs = FILE_OPEN_TIMEOUT_SECS
# This block of options is used by FieldQuery to limit the
# domain of search.
self.pkg_name = None
self.action_type = None
self.key = None
self.action_type_wildcard = True
self.key_wildcard = True
self.pkg_name_wildcard = True
self.pkg_name_match = None
self._case_sensitive = None
self._dir_path = None
# These variables are set by set_info and are used to hold
# information specific to the particular search index that this
# AST is being built for.
self._manifest_path_func = None
self._data_manf = None
self._data_token_offset = None
self._data_main_dict = None
def __init_gdd(self, path):
gdd = self._global_data_dict
if path in gdd:
return
# Setup default global dictionary for this index path.
gdd[path] = {
"manf": ss.IndexStoreDict(ss.MANIFEST_LIST),
"token_byte_offset": ss.IndexStoreDictMutable(
ss.BYTE_OFFSET_FILE),
"fmri_offsets": ss.InvertedDict(ss.FMRI_OFFSETS_FILE, None)
}
@classmethod
def __lock_gdd(cls, index_dir):
# This lock is used so that only one instance of a TermQuery
# object is ever modifying the class-wide variable for this
# index.
cls.__dict_locks.setdefault(index_dir,
threading.Lock()).acquire()
@classmethod
def __unlock_gdd(cls, index_dir):
cls.__dict_locks[index_dir].release()
@classmethod
def _get_gdd(cls, path):
return cls._global_data_dict[path]
def __repr__(self):
return "( TermQuery: " + self._term + " )"
def __str__(self):
return "{0}:{1}".format(self.field_strings(),
self.__wc_to_string(False, self._term))
def field_strings(self):
return ":".join([
self.__wc_to_string(wc, v)
for wc, v in [(self.pkg_name_wildcard, self.pkg_name),
(self.action_type_wildcard, self.action_type),
(self.key_wildcard, self.key)
]
])
def propagate_pkg_return(self):
"""Inserts a conversion to package results into the tree.
Creates a new node by wrapping a PkgConversion node around
itself. It then returns the new node to its parent for
insertion into the tree."""
return PkgConversion(self)
@staticmethod
def __wc_to_string(wc, v):
if wc:
return ""
return "\\:".join(v.split(":"))
@classmethod
def clear_cache(cls, index_dir):
"""Dump any cached index data for specified index path."""
gdd = cls._global_data_dict
cls.__lock_gdd(index_dir)
try:
del gdd[index_dir]
except KeyError:
pass
finally:
cls.__unlock_gdd(index_dir)
def add_field_restrictions(self, pkg_name, action_type, key):
"""Add the information needed to restrict the search domain
to the specified fields."""
self.pkg_name = pkg_name
self.action_type = action_type
self.key = key
self.action_type_wildcard = \
self.__is_wildcard(self.action_type)
self.key_wildcard = self.__is_wildcard(self.key)
self.pkg_name_wildcard = self.__is_wildcard(self.pkg_name)
# Because users will rarely want to search on package names
# by fully specifying them out to their time stamps, package
# name search is treated as a prefix match. To accomplish
# this, a star is appended to the package name if it doesn't
# already end in one.
if not self.pkg_name_wildcard:
if not self.pkg_name.endswith("*"):
self.pkg_name += "*"
self.pkg_name_match = \
re.compile(fnmatch.translate(self.pkg_name),
re.I).match
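# Illustrative example: add_field_restrictions("web/server", "dir",
# "path") restricts search to actions of type "dir" with key "path",
# and turns the package restriction into the case-insensitive prefix
# pattern "web/server*" so that any version or time stamp of the
# package matches.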
@staticmethod
def __is_wildcard(s):
return s == '*' or s == ''
def add_trailing_wildcard(self):
"""Ensures that the search is a prefix match. Primarily used
by the PhraseQuery class."""
if not self._term.endswith('*'):
self._term += "*"
def set_info(self, index_dir, get_manifest_path,
case_sensitive, **kwargs):
"""Sets the information needed to search which is specific to
the particular index used to back the search.
'index_dir' is a path to the base directory of the index.
'get_manifest_path' is a function which when given a
fully specified fmri returns the path to the manifest file
for that fmri.
'case_sensitive' is a boolean which determines whether search
is case sensitive or not."""
self._dir_path = index_dir
assert self._dir_path
self._manifest_path_func = get_manifest_path
self._case_sensitive = case_sensitive
self.__init_gdd(self._dir_path)
# Take the static class lock because it's possible we'll
# modify the shared dictionaries for the class.
self.__lock_gdd(self._dir_path)
gdd = self._global_data_dict
tq_gdd = self._get_gdd(self._dir_path)
try:
self._data_main_dict = \
ss.IndexStoreMainDict(ss.MAIN_FILE)
if "fmri_offsets" not in tq_gdd:
tq_gdd["fmri_offsets"] = ss.InvertedDict(
ss.FMRI_OFFSETS_FILE, None)
# Create a temporary list of dictionaries we need to
# open consistently.
tmp = list(tq_gdd.values())
tmp.append(self._data_main_dict)
try:
# Try to open the index files assuming they
# were made after the conversion to using the
# fmri_offsets.v1 file.
ret = ss.consistent_open(tmp, self._dir_path,
self._file_timeout_secs)
except search_errors.InconsistentIndexException:
# If opening the index fails, try falling
# back to the index prior to the conversion
# to using the fmri_offsets.v1 file.
del tq_gdd["fmri_offsets"]
tmp = list(tq_gdd.values())
tmp.append(self._data_main_dict)
ret = ss.consistent_open(tmp, self._dir_path,
self._file_timeout_secs)
if ret is None:
raise search_errors.NoIndexException(
self._dir_path)
should_reread = False
# Check to see if any of the in-memory stores of the
# dictionaries are out of date compared to the ones
# on disk.
for k, d in tq_gdd.items():
if d.should_reread():
should_reread = True
break
try:
# If any of the in-memory dictionaries are out
# of date, create new copies of the shared
# dictionaries and make the shared structure
# point at them. This is to prevent changing
# the data structures in any other threads
# which may be part way through executing a
# search.
if should_reread:
for i in tmp:
i.close_file_handle()
tq_gdd = gdd[self._dir_path] = \
dict([
(k, copy.copy(d))
for k, d
in tq_gdd.items()
])
tmp = list(tq_gdd.values())
tmp.append(self._data_main_dict)
ret = ss.consistent_open(tmp,
self._dir_path,
self._file_timeout_secs)
try:
if ret is None:
raise search_errors.NoIndexException(
self._dir_path)
# Reread the dictionaries and
# store the new information in
# the shared data structure.
for d in tq_gdd.values():
d.read_dict_file()
except:
self._data_main_dict.close_file_handle()
raise
finally:
# Ensure that the files are closed no matter
# what happens.
for d in tq_gdd.values():
d.close_file_handle()
self._data_manf = tq_gdd["manf"]
self._data_token_offset = tq_gdd["token_byte_offset"]
self._data_fmri_offsets = tq_gdd.get("fmri_offsets",
None)
finally:
self.__unlock_gdd(self._dir_path)
def allow_version(self, v):
"""Returns whether the query supports a query of version v."""
return True
def _close_dicts(self):
"""Closes the main dictionary file handle, which is handled
separately from the other dictionaries since it's not read
entirely into memory in one shot."""
self._data_main_dict.close_file_handle()
@staticmethod
def flatten(lst):
"""Takes a list which may contain one or more sublists and
returns a list which contains all the items contained in the
original list and its sublists, but without any sublists."""
res = []
for l in lst:
if isinstance(l, list):
res.extend(TermQuery.flatten(l))
else:
res.append(l)
return res
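# For example:
#     TermQuery.flatten(["a", ["b", ["c", "d"]]]) -> ["a", "b", "c", "d"]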
def _restricted_search_internal(self, restriction):
"""Searches for the given term within a restricted domain of
search results. restriction is a generator function that
returns actions within the search domain."""
glob = self._glob
term = self._term
case_sensitive = self._case_sensitive
if not case_sensitive:
glob = True
for at, st, fmri_str, fv, l in restriction:
# Check if the current action doesn't match any field
# query specifications.
if not (self.pkg_name_wildcard or
self.pkg_name_match(fmri_str)) or \
not (self.action_type_wildcard or
at == self.action_type) or \
not (self.key_wildcard or st == self.key):
continue
l = l.strip()
# Find the possible tokens for this action.
act = actions.fromstr(l)
toks = [t[2] for t in act.generate_indices()]
toks = self.flatten(toks)
matches = []
if glob:
matches = choose(toks, term, case_sensitive)
elif term in toks:
matches = True
# If this search term matches any of the tokens the
# action could generate, yield it. If not, continue
# on to the next action.
if matches:
yield at, st, fmri_str, fv, l
def __offset_line_read(self, offsets):
"""Takes a group of byte offsets into the main dictionary and
reads the lines starting at those byte offsets."""
md_fh = self._data_main_dict.get_file_handle()
for o in sorted(offsets):
md_fh.seek(o)
yield md_fh.readline()
def _read_pkg_dirs(self, fmris):
"""Legacy function used to search indexes which have a pkg
directory with fmri offset information instead of the
fmri_offsets.v1 file."""
if not os.path.isdir(os.path.join(self._dir_path, "pkg")):
raise search_errors.InconsistentIndexException(
self._dir_path)
if TermQuery.fmris is None and not self.pkg_name_wildcard:
TermQuery.fmris = list(fmris())
pkg_offsets = set()
for matching_pfmri in (
p
for p in TermQuery.fmris
if self.pkg_name_match(
p.get_fmri(anarchy=True, include_scheme=False))
):
try:
fh = open(os.path.join(
self._dir_path, "pkg",
matching_pfmri.get_name(),
str(matching_pfmri.version)), "rb")
except EnvironmentError as e:
if e.errno != errno.ENOENT:
raise
continue
for l in fh:
pkg_offsets.add(int(l))
return pkg_offsets
def _search_internal(self, fmris):
"""Searches the indexes in dir_path for any matches of query
and the results in self.res. The method assumes the
dictionaries have already been loaded and read appropriately.
The "fmris" parameter is a generator of fmris of installed
packages."""
assert self._data_main_dict.get_file_handle() is not None
glob = self._glob
term = self._term
case_sensitive = self._case_sensitive
if not case_sensitive:
glob = True
# If offsets is equal to None, match all possible results. A
# match with no results is represented by an empty set.
offsets = None
if glob:
# If the term has at least one non-wildcard character
# in it, do the glob search.
if TermQuery.has_non_wildcard_character.match(term):
keys = self._data_token_offset.get_keys()
matches = choose(keys, term, case_sensitive)
offsets = set([
self._data_token_offset.get_id(match)
for match in matches
])
elif self._data_token_offset.has_entity(term):
offsets = set([
self._data_token_offset.get_id(term)])
else:
# Close the dictionaries since there are
# no more results to yield.
self._close_dicts()
return
# Restrict results by package name.
if not self.pkg_name_wildcard:
try:
pkg_offsets = \
self._data_fmri_offsets.get_offsets(
self.pkg_name_match)
except AttributeError:
pkg_offsets = self._read_pkg_dirs(fmris)
if offsets is None:
offsets = pkg_offsets
else:
offsets &= pkg_offsets
# Restrict results by action type.
if not self.action_type_wildcard:
tmp_set = set()
try:
fh = open(os.path.join(self._dir_path,
"__at_" + self.action_type), "rb")
for l in fh:
tmp_set.add(int(l.strip()))
if offsets is None:
offsets = tmp_set
else:
offsets &= tmp_set
except EnvironmentError as e:
if e.errno != errno.ENOENT:
raise
# If the file doesn't exist, then no actions
# with that action type were indexed.
offsets = set()
# Restrict results by key.
if not self.key_wildcard:
tmp_set = set()
try:
fh = open(os.path.join(self._dir_path,
"__st_" + self.key), "rb")
for l in fh:
tmp_set.add(int(l))
if offsets is None:
offsets = tmp_set
else:
offsets &= tmp_set
except EnvironmentError as e:
if e.errno != errno.ENOENT:
raise
# If the file doesn't exist, then no actions
# with that key were indexed.
offsets = set()
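# Illustrative example of the restriction logic above: if the glob
# match yields token offsets {3, 17, 42} and the "__at_file" index
# (for a 'file' action type restriction) yields {17, 42, 99}, the
# surviving candidate offsets are {17, 42}. An offsets value that
# is still None here means both the term and all fields were
# wildcards, in which case every line of the main dictionary is
# scanned below.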
line_iter = EmptyI
# If offsets isn't None, then the set of results has been
# restricted so iterate through those offsets.
if offsets is not None:
line_iter = self.__offset_line_read(offsets)
# If offsets is None and the term was only wildcard search
# tokens, return results for every known token.
elif glob and \
not TermQuery.has_non_wildcard_character.match(term):
line_iter = self._data_main_dict.get_file_handle()
for line in line_iter:
assert line != '\n'
tok, at_lst = \
self._data_main_dict.parse_main_dict_line(line)
# Check that the token was what was expected.
assert ((term == tok) or
(not case_sensitive and
term.lower() == tok.lower()) or
(glob and fnmatch.fnmatch(tok, term)) or
(not case_sensitive and
fnmatch.fnmatch(tok.lower(), term.lower())))
# Check the various action types this token was
# associated with.
for at, st_list in at_lst:
if not self.action_type_wildcard and \
at != self.action_type:
continue
# Check the key types this token and action type
# were associated with.
for st, fv_list in st_list:
if not self.key_wildcard and \
st != self.key:
continue
# Get the values which matched this
# token.
for fv, p_list in fv_list:
# Get the fmri id and list of
# offsets into the manifest for
# that fmri id.
for p_id, m_off_set in p_list:
p_id = int(p_id)
p_str = self._data_manf.get_entity(p_id)
# Check that the pkg
# name matches the
# restrictions, if any
# exist.
if not self.pkg_name_wildcard and not self.pkg_name_match(p_str):
continue
int_os = [
int(o)
for o
in m_off_set
]
yield (p_str, int_os,
at, st, fv)
# Close the dictionaries since there are no more results to
# yield.
self._close_dicts()
return
def _get_results(self, res):
"""Takes the results from search_internal ("res") and reads the
lines from the manifest files at the provided offsets."""
for fmri_str, offsets, at, st, fv in res:
send_res = []
f = fmri.PkgFmri(fmri_str)
path = self._manifest_path_func(f)
# XXX If action size changes substantially, we should
# reexamine whether the buffer size should be changed.
file_handle = open(path, "rb", buffering=512)
for o in sorted(offsets):
file_handle.seek(o)
send_res.append((fv, at, st,
file_handle.readline()))
file_handle.close()
for fv, at, st, l in send_res:
yield at, st, fmri_str, fv, l