#!/usr/bin/python
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved.
#
import atexit
import collections
import errno
import tarfile as tf
import os
import shutil
import six
import sys
import tempfile
from six.moves.urllib.parse import unquote
import pkg
import pkg.catalog
import pkg.client.api_errors as apx
import pkg.client.publisher
import pkg.digest as digest
import pkg.fmri
import pkg.manifest
import pkg.misc
import pkg.portable
import pkg.p5i
import pkg.pkggzip
import pkg.pkgtarfile as ptf
from pkg.misc import force_bytes, force_str
# Python 3 has no separate 'long' type; alias it for use below.
if six.PY3:
    long = int
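# Illustrative round-trip sketch for this module's main interfaces; the
# paths and FMRI below are hypothetical and nothing here is executed at
# import time:
#
#   arc = Archive("/tmp/example.p5p", mode="w")
#   arc.add_package("pkg://test/foo@1.0,5.11-0:20110101T000000Z",
#       "/tmp/foo.manifest", "/tmp/files")
#   arc.close()
#
#   arc = Archive("/tmp/example.p5p", mode="r")
#   m = arc.get_package_manifest("pkg://test/foo@1.0,5.11-0:20110101T000000Z")
#   arc.close()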
class ArchiveErrors(apx.ApiException):
"""Base exception class for archive class errors."""
class InvalidArchiveIndex(ArchiveErrors):
"""Used to indicate that the specified index is in a format not
supported or recognized by this version of the pkg(7) ArchiveIndex
class."""
def __init__(self, arc_name):
ArchiveErrors.__init__(self)
self.__name = arc_name
def __str__(self):
return _("{0} is not in a supported or recognizable archive "
"index format.").format(self.__name)
class ArchiveIndex(object):
"""Class representing a pkg(7) archive table of contents and a set of
interfaces to populate and retrieve entries.
Entries in this file are written in the following format:
<name>NUL<offset>NUL<entry_size>NUL<size>NUL<typeflag>NULNL
<name> is a string containing the pathname of the file in the
archive. It can be up to 65,535 bytes in length.
<offset> is an unsigned long long integer containing the relative
offset in bytes of the first header block for the file in the
archive. The offset is relative to the end of the last block of
the first file in the archive.
<entry_size> is an unsigned long long integer containing the size of
the file's entry in bytes in the archive (including archive
headers and trailers for the entry).
<size> is an unsigned long long integer containing the size of the
file in bytes in the archive.
<typeflag> is a single character representing the type of the file
in the archive. Possible values are:
0 Regular File
1 Hard Link
2 Symbolic Link
5 Directory or subdirectory"""
version = None
CURRENT_VERSION = 0
COMPATIBLE_VERSIONS = 0,
ENTRY_FORMAT = "{0}\0{1:d}\0{2:d}\0{3:d}\0{4}\0\n"
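    # For illustration, one encoded index entry (values hypothetical):
    #   b"publisher/test/file/ab/ab0123\x001536\x002048\x001321\x000\x00\n"
    # i.e. name, offset, entry_size, size, and typeflag, each terminated
    # by a NUL byte, with a trailing newline.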
def __init__(self, name, mode="r", version=None):
"""Open a pkg(7) archive table of contents file.
'name' should be the absolute path of the file to use when
reading or writing index data.
'mode' indicates whether the index is being used for reading
or writing, and can be 'r' or 'w'. Appending to or updating
a table of contents file is not supported.
'version' is an optional integer value specifying the version
of the index to be read or written. If not specified, the
current version is assumed.
"""
assert os.path.isabs(name)
if version is None:
version = self.CURRENT_VERSION
if version not in self.COMPATIBLE_VERSIONS:
raise InvalidArchiveIndex(name)
self.__closed = False
self.__name = name
self.__mode = mode
try:
self.__file = pkg.pkggzip.PkgGzipFile(self.__name,
self.__mode)
except IOError as e:
if e.errno:
raise
# Underlying gzip library raises this exception if the
# file isn't a valid gzip file. So, assume that if
# errno isn't set, this is a gzip error instead.
raise InvalidArchiveIndex(name)
self.version = version
    def __enter__(self):
        """Context handler that enables 'with' usage."""
        return self
    def __exit__(self, exc_type, exc_value, exc_tb):
        """Context handler that ensures the index file is automatically
        closed when the 'with' block exits, in both error and non-error
        conditions.
        """
        self.close()
@property
def pathname(self):
"""The absolute path of the archive index file."""
return self.__name
def add(self, name, offset, entry_size, size, typeflag):
"""Add an entry for the given archive file to the table of
contents."""
# GzipFile.write requires bytes input
self.__file.write(force_bytes(self.ENTRY_FORMAT.format(
name, offset, entry_size, size, typeflag)))
def offsets(self):
"""Returns a generator that yields tuples of the form (name,
offset) for each file in the index."""
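        # Entries are NUL-delimited, but a member name may itself contain
        # a newline; the loop below therefore accumulates physical lines
        # until one ends with the expected NUL terminator before splitting.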
self.__file.seek(0)
l = None
try:
for line in self.__file:
# Under Python 3, indexing on a bytes will
# return an integer representing the
# unicode code point of that character; we
# need to use slicing to get the character.
if line[-2:-1] != b"\0":
# Filename contained newline.
if l is None:
l = line
else:
l += b"\n"
l += line
continue
elif l is None:
l = line
name, offset, ignored = l.split(b"\0", 2)
yield force_str(name), long(offset)
l = None
except ValueError:
raise InvalidArchiveIndex(self.__name)
except IOError as e:
if e.errno:
raise
# Underlying gzip library raises this exception if the
# file isn't a valid gzip file. So, assume that if
# errno isn't set, this is a gzip error instead.
raise InvalidArchiveIndex(self.__name)
def close(self):
"""Close the index. No further operations can be performed
using this object once closed."""
if self.__closed:
return
if self.__file:
self.__file.close()
self.__file = None
self.__closed = True
class InvalidArchive(ArchiveErrors):
"""Used to indicate that the specified archive is in a format not
supported or recognized by this version of the pkg(7) Archive class.
"""
def __init__(self, arc_name):
ArchiveErrors.__init__(self)
self.arc_name = arc_name
def __str__(self):
return _("Archive {0} is missing, unsupported, or corrupt.").format(
self.arc_name)
class CorruptArchiveFiles(ArchiveErrors):
"""Used to indicate that the specified file(s) could not be found in the
archive.
"""
def __init__(self, arc_name, files):
ArchiveErrors.__init__(self)
self.arc_name = arc_name
self.files = files
def __str__(self):
return _("Package archive {arc_name} contains corrupt "
"entries for the requested package file(s):\n{files}.").format(
arc_name=self.arc_name,
files="\n".join(self.files))
class UnknownArchiveFiles(ArchiveErrors):
"""Used to indicate that the specified file(s) could not be found in the
archive.
"""
def __init__(self, arc_name, files):
ArchiveErrors.__init__(self)
self.arc_name = arc_name
self.files = files
def __str__(self):
return _("Package archive {arc_name} does not contain the "
"requested package file(s):\n{files}.").format(
arc_name=self.arc_name,
files="\n".join(self.files))
class UnknownPackageManifest(ArchiveErrors):
"""Used to indicate that a manifest for the specified package could not
be found in the archive.
"""
def __init__(self, arc_name, pfmri):
ArchiveErrors.__init__(self)
self.arc_name = arc_name
self.pfmri = pfmri
def __str__(self):
return _("No package manifest for package '{pfmri}' exists "
"in archive {arc_name}.").format(**self.__dict__)
class Archive(object):
"""Class representing a pkg(7) archive and a set of interfaces to
populate it and retrieve data from it.
This class stores package data in pax archives in version 4 repository
format. Encoding the structure of a repository into the archive is
necessary to enable easy composition of package archive contents with
existing repositories and to enable consumers to access the contents of
a package archive the same as they would a repository.
This class can be used to access or extract the contents of almost any
tar archive, except for those that are compressed.
"""
__idx_pfx = "pkg5.index."
__idx_sfx = ".gz"
__idx_name = "pkg5.index.{0}.gz"
__idx_ver = ArchiveIndex.CURRENT_VERSION
__index = None
__arc_tfile = None
__arc_file = None
version = None
# If the repository format changes, then the version of the package
# archive format should be rev'd and this updated. (Although that isn't
# strictly necessary, as the Repository class should remain backwards
# compatible with this format.)
CURRENT_VERSION = 0
COMPATIBLE_VERSIONS = (0,)
def __init__(self, pathname, mode="r", archive_index=None):
"""'pathname' is the absolute path of the archive file to create
or read from.
'mode' is a string used to indicate whether the archive is being
opened for reading or writing, which is indicated by 'r' and 'w'
respectively. An archive opened for writing may not be used for
any extraction operations, and must not already exist.
        'archive_index', if supplied, is the dictionary returned by
self.get_index(), allowing multiple Archive objects to be open,
sharing the same index object, for efficient use of memory.
Using an existing archive_index requires mode='r'.
"""
assert os.path.isabs(pathname)
self.__arc_name = pathname
self.__closed = False
self.__mode = mode
self.__temp_dir = tempfile.mkdtemp()
# Used to cache publisher objects.
self.__pubs = None
# Used to cache location of publisher catalog data.
self.__catalogs = {}
arc_mode = mode + "b"
mode += ":"
assert "r" in mode or "w" in mode
assert "a" not in mode
if "w" in mode:
# Don't allow overwrite of existing archive.
assert not os.path.exists(self.__arc_name)
# Ensure we're not sharing an index object.
assert not archive_index
try:
self.__arc_file = open(self.__arc_name, arc_mode,
128*1024)
except EnvironmentError as e:
if e.errno in (errno.ENOENT, errno.EISDIR):
raise InvalidArchive(self.__arc_name)
raise apx._convert_error(e)
self.__queue_offset = 0
self.__queue = collections.deque()
# Ensure cleanup is performed on exit if the archive is not
# explicitly closed.
def arc_cleanup():
if not self.__closed:
self.__close_fh()
self.__cleanup()
return
atexit.register(arc_cleanup)
# Open the pax archive for the package.
try:
self.__arc_tfile = ptf.PkgTarFile.open(mode=mode,
fileobj=self.__arc_file, format=tf.PAX_FORMAT)
except EnvironmentError as e:
raise apx._convert_error(e)
except Exception:
# Likely not an archive or the archive is corrupt.
raise InvalidArchive(self.__arc_name)
self.__extract_offsets = {}
if "r" in mode:
# Opening the tarfile loaded the first member, which
# should be the archive index file.
member = self.__arc_tfile.firstmember
if not member:
# Archive is empty.
raise InvalidArchive(self.__arc_name)
# If we have an archive_index use that and return
# immediately. We assume that the caller has obtained
            # the index from an existing Archive object,
# and will have validated the version of that archive.
if archive_index:
self.__extract_offsets = archive_index
return
if not member.name.startswith(self.__idx_pfx) or \
not member.name.endswith(self.__idx_sfx):
return
else:
self.__idx_name = member.name
comment = member.pax_headers.get("comment", "")
if not comment.startswith("pkg5.archive.version."):
return
try:
self.version = int(comment.rsplit(".", 1)[-1])
except (IndexError, ValueError):
raise InvalidArchive(self.__arc_name)
if self.version not in self.COMPATIBLE_VERSIONS:
raise InvalidArchive(self.__arc_name)
# Create a temporary file to extract the index to,
# and then extract it from the archive.
fobj, idxfn = self.__mkstemp()
fobj.close()
try:
self.__arc_tfile.extract_to(member,
path=self.__temp_dir,
filename=os.path.basename(idxfn))
except tf.TarError:
# Read error encountered.
raise InvalidArchive(self.__arc_name)
except EnvironmentError as e:
raise apx._convert_error(e)
# After extraction, the current archive file offset
# is the base that will be used for all other
# extractions.
index_offset = self.__arc_tfile.offset
# Load archive index.
try:
self.__index = ArchiveIndex(idxfn,
mode="r", version=self.__idx_ver)
for name, offset in \
self.__index.offsets():
self.__extract_offsets[name] = \
index_offset + offset
except InvalidArchiveIndex:
# Index is corrupt; rather than driving on
# and failing later, bail now.
os.unlink(idxfn)
raise InvalidArchive(self.__arc_name)
except EnvironmentError as e:
raise apx._convert_error(e)
elif "w" in mode:
self.__pubs = {}
# Force normalization of archive member mode and
# ownership information during archive creation.
def gettarinfo(*args, **kwargs):
ti = ptf.PkgTarFile.gettarinfo(self.__arc_tfile,
*args, **kwargs)
if ti.isreg():
ti.mode = pkg.misc.PKG_FILE_MODE
elif ti.isdir():
ti.mode = pkg.misc.PKG_DIR_MODE
                # Tag the index file with a comment identifying the
                # archive format version; readers validate this.
                if ti.name == self.__idx_name:
ti.pax_headers["comment"] = \
"pkg5.archive.version.{0:d}".format(
self.CURRENT_VERSION)
ti.uid = 0
ti.gid = 0
ti.uname = "root"
ti.gname = "root"
return ti
self.__arc_tfile.gettarinfo = gettarinfo
self.__idx_name = self.__idx_name.format(self.__idx_ver)
# Create a temporary file to write the index to,
# and then create the index.
fobj, idxfn = self.__mkstemp()
fobj.close()
self.__index = ArchiveIndex(idxfn, mode=arc_mode)
# Used to determine what the default publisher will be
# for the archive file at close().
self.__default_pub = ""
# Used to keep track of which package files have already
# been added to archive.
self.__processed_pfiles = set()
# Always create archives using current version.
self.version = self.CURRENT_VERSION
            # Always add the base publisher directory first; tarfile
            # requires an actual filesystem object as a source, so an
            # existing directory is reused for this purpose.
self.add("/", arcname="publisher")
    def __enter__(self):
        """Context handler that enables 'with' usage."""
        return self
    def __exit__(self, exc_type, exc_value, exc_tb):
        """Context handler that ensures archive is automatically closed
        in a non-error condition scenario.  This enables 'with' usage.
        """
        if exc_type or exc_value or exc_tb:
            # Only close file objects; don't actually write anything
            # out in an error condition.
            self.__close_fh()
            return
        # Close and/or write out archive as needed.
        self.close()
def __find_extract_offsets(self):
"""Private helper method to find offsets for individual archive
member extraction.
"""
if self.__extract_offsets:
return
# This causes the entire archive to be read, but is the only way
# to find the offsets to extract everything.
try:
for member in self.__arc_tfile.getmembers():
self.__extract_offsets[member.name] = \
member.offset
except tf.TarError:
# Read error encountered.
raise InvalidArchive(self.__arc_name)
except EnvironmentError as e:
raise apx._convert_error(e)
def __mkdtemp(self):
"""Creates a temporary directory for use during archive
operations, and return its absolute path. The temporary
directory will be removed after the archive is closed.
"""
try:
return tempfile.mkdtemp(dir=self.__temp_dir)
except EnvironmentError as e:
raise apx._convert_error(e)
def __mkstemp(self):
"""Creates a temporary file for use during archive operations,
and returns a file object for it and its absolute path. The
temporary file will be removed after the archive is closed.
"""
try:
fd, fn = tempfile.mkstemp(dir=self.__temp_dir)
fobj = os.fdopen(fd, "w")
except EnvironmentError as e:
raise apx._convert_error(e)
return fobj, fn
def add(self, pathname, arcname=None):
"""Queue the specified object for addition to the archive.
The archive will be created and the object added to it when the
close() method is called. The target object must not change
after this method is called while the archive is open. The
item being added must not already exist in the archive.
'pathname' is an optional string specifying the absolute path
of a file to add to the archive. The file may be a regular
file, directory, symbolic link, or hard link.
'arcname' is an optional string specifying an alternative name
for the file in the archive. If not given, the full pathname
provided will be used.
"""
assert not self.__closed and "w" in self.__mode
tfile = self.__arc_tfile
ti = tfile.gettarinfo(pathname, arcname=arcname)
buf = ti.tobuf(tfile.format, tfile.encoding, tfile.errors)
# Pre-calculate size of archive entry by determining where
# in the archive the entry would be added.
entry_sz = len(buf)
blocks, rem = divmod(ti.size, tf.BLOCKSIZE)
if rem > 0:
blocks += 1
entry_sz += blocks * tf.BLOCKSIZE
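        # For example, a 700 byte payload needs ceil(700 / 512) == 2 data
        # blocks, so that member consumes len(buf) + 1024 bytes overall.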
        # Record the name, offset, entry_size, size, and type for each
        # file.
self.__index.add(ti.name, self.__queue_offset, entry_sz,
ti.size, ti.type)
self.__queue_offset += entry_sz
self.__queue.append((pathname, ti.name))
# Discard tarinfo; it would be more efficient to keep these in
# memory, but at a significant memory footprint cost.
ti.tarfile = None
del ti
def __add_publisher_files(self, root, file_dir, hashes, fpath=None,
repo=None):
"""Private helper function for adding package files."""
if file_dir not in self.__processed_pfiles:
# Directory entry needs to be added
# for package files.
self.add(root, arcname=file_dir)
self.__processed_pfiles.add(file_dir)
for fhash in hashes:
hash_dir = os.path.join(file_dir, fhash[:2])
if hash_dir not in self.__processed_pfiles:
# Directory entry needs to be added
# for hash directory.
self.add(root, arcname=hash_dir)
self.__processed_pfiles.add(hash_dir)
hash_fname = os.path.join(hash_dir, fhash)
if hash_fname in self.__processed_pfiles:
# Already added for a different
# package.
continue
if repo:
src = repo.file(fhash)
else:
src = os.path.join(fpath, fhash)
self.add(src, arcname=hash_fname)
# A bit expensive potentially in terms of
# memory usage, but necessary to prevent
# duplicate archive entries.
self.__processed_pfiles.add(hash_fname)
def __add_package(self, pfmri, mpath, fpath=None, repo=None):
"""Private helper function that queues a package for addition to
the archive.
'mpath' is the absolute path of the package manifest file.
'fpath' is an optional directory containing the package files
stored by hash.
'repo' is an optional Repository object to use to retrieve the
data for the package to be added to the archive.
'fpath' or 'repo' must be provided.
"""
assert not self.__closed and "w" in self.__mode
assert mpath
assert not (fpath and repo)
assert fpath or repo
if not self.__default_pub:
self.__default_pub = pfmri.publisher
m = pkg.manifest.Manifest(pfmri)
m.set_content(pathname=mpath)
# Throughout this function, the archive root directory is used
# as a template to add other directories that should be present
# in the archive. This is necessary as the tarfile class does
# not support adding arbitrary archive entries without a real
# filesystem object as a source.
root = os.path.dirname(self.__arc_name)
pub_dir = os.path.join("publisher", pfmri.publisher)
pkg_dir = os.path.join(pub_dir, "pkg")
for d in pub_dir, pkg_dir:
if d not in self.__processed_pfiles:
self.add(root, arcname=d)
self.__processed_pfiles.add(d)
# After manifest has been loaded, assume it's ok to queue the
# manifest itself for addition to the archive.
arcname = os.path.join(pkg_dir, pfmri.get_dir_path())
# Entry may need to be added for manifest directory.
man_dir = os.path.dirname(arcname)
if man_dir not in self.__processed_pfiles:
self.add(root, arcname=man_dir)
self.__processed_pfiles.add(man_dir)
# Entry needs to be added for manifest file.
self.add(mpath, arcname=arcname)
# Now add any files to the archive for every action that has a
# payload. (That payload can consist of multiple files.)
file_dir = os.path.join(pub_dir, "file")
for a in m.gen_actions():
if not a.has_payload:
# Nothing to archive.
continue
pref_hattr, hval, hfunc = \
digest.get_least_preferred_hash(a)
if not hval:
# Nothing to archive
continue
payloads = set([hval])
# Signature actions require special handling.
if a.name == "signature":
for c in a.get_chain_certs(
least_preferred=True):
payloads.add(c)
if repo:
# This bit of logic only possible if
# package source is a repository.
pub = self.__pubs.get(pfmri.publisher,
None)
if not pub:
self.__pubs[pfmri.publisher] = \
pub = repo.get_publisher(
pfmri.publisher)
assert pub
if not payloads:
# Nothing more to do.
continue
self.__add_publisher_files(root, file_dir, payloads,
fpath=fpath, repo=repo)
def add_package(self, pfmri, mpath, fpath):
"""Queues the specified package for addition to the archive.
The archive will be created and the package added to it when
the close() method is called. The package contents must not
change after this method is called while the archive is open.
'pfmri' is the FMRI string or object identifying the package to
add.
'mpath' is the absolute path of the package manifest file.
'fpath' is the directory containing the package files stored
by hash.
"""
assert pfmri and mpath and fpath
if isinstance(pfmri, six.string_types):
pfmri = pkg.fmri.PkgFmri(pfmri)
assert pfmri.publisher
self.__add_package(pfmri, mpath, fpath=fpath)
def add_repo_package(self, pfmri, repo):
"""Queues the specified package in a repository for addition to
the archive. The archive will be created and the package added
to it when the close() method is called. The package contents
must not change after this method is called while the archive is
open.
'pfmri' is the FMRI string or object identifying the package to
add.
'repo' is the Repository object to use to retrieve the data for
the package to be added to the archive.
"""
assert pfmri and repo
if isinstance(pfmri, six.string_types):
pfmri = pkg.fmri.PkgFmri(pfmri)
assert pfmri.publisher
self.__add_package(pfmri, repo.manifest(pfmri), repo=repo)
def extract_catalog1(self, part, path, pub=None):
"""Extract the named v1 catalog part to the specified directory.
'part' is the name of the catalog file part.
'path' is the absolute path of the directory to extract the
file to. It will be created automatically if it does not
exist.
'pub' is an optional publisher prefix. If not provided, the
first publisher catalog found in the archive will be used.
"""
# If the extraction index doesn't exist, scan the
# complete archive and build one.
self.__find_extract_offsets()
pubs = [
p for p in self.get_publishers()
if not pub or p.prefix == pub
]
if not pubs:
raise UnknownArchiveFiles(self.__arc_name, [part])
if not pub:
# Default to first known publisher.
pub = pubs[0].prefix
# Expected locations in archive for various metadata.
# A trailing slash is appended so that archive entry
# comparisons skip the entries for the directory.
pubpath = os.path.join("publisher", pub) + os.path.sep
catpath = os.path.join(pubpath, "catalog") + os.path.sep
partpath = os.path.join(catpath, part)
if pub in self.__catalogs:
# Catalog file requested for this publisher before.
croot = self.__catalogs[pub]
if croot:
# Catalog data is cached because it was
# generated on demand, so just copy it
# from there to the destination.
src = os.path.join(croot, part)
if not os.path.exists(src):
raise UnknownArchiveFiles(
self.__arc_name, [partpath])
try:
pkg.portable.copyfile(
os.path.join(croot, part),
os.path.join(path, part))
except EnvironmentError as e:
raise apx._convert_error(e)
else:
# Use default extraction logic.
self.extract_to(partpath, path, filename=part)
return
# Determine whether any catalog files are present for this
# publisher in the archive.
for name in self.__extract_offsets:
if name.startswith(catpath):
# Any catalog file at all means this publisher
# should be marked as being known to have one
# and then the request passed on to extract_to.
self.__catalogs[pub] = None
return self.extract_to(partpath, path,
filename=part)
# No catalog data found for publisher; construct a catalog
# in memory based on packages found for publisher.
cat = pkg.catalog.Catalog(batch_mode=True)
manpath = os.path.join(pubpath, "pkg") + os.path.sep
lm = None
for name in self.__extract_offsets:
if name.startswith(manpath) and name.count("/") == 4:
ignored, stem, ver = name.rsplit("/", 2)
stem = unquote(stem)
ver = unquote(ver)
pfmri = pkg.fmri.PkgFmri(name=stem,
publisher=pub, version=ver)
pfmri_tmp_ts = pfmri.get_timestamp()
if not lm or lm < pfmri_tmp_ts:
lm = pfmri_tmp_ts
fobj = self.get_file(name)
m = pkg.manifest.Manifest(pfmri=pfmri)
m.set_content(content=force_str(fobj.read()),
signatures=True)
cat.add_package(pfmri, manifest=m)
# Store catalog in a temporary directory and mark publisher
# as having catalog data cached.
croot = self.__mkdtemp()
cat.meta_root = croot
cat.file_root = croot
cat.batch_mode = False
cat.finalize()
if lm:
cat.last_modified = lm
cat.save()
self.__catalogs[pub] = croot
# Finally, copy requested file to destination.
try:
pkg.portable.copyfile(os.path.join(croot, part),
os.path.join(path, part))
except EnvironmentError as e:
raise apx._convert_error(e)
def extract_package_files(self, hashes, path, pub=None):
"""Extract one or more package files from the archive.
'hashes' is a list of the files to extract named by their hash.
'path' is the absolute path of the directory to extract the
files to. It will be created automatically if it does not
exist.
'pub' is the prefix (name) of the publisher that the package
files are associated with. If not provided, the first file
named after the given hash found in the archive will be used.
(This will be noticeably slower depending on the size of the
archive.)
"""
assert not self.__closed and "r" in self.__mode
assert hashes
# If the extraction index doesn't exist, scan the complete
# archive and build one.
self.__find_extract_offsets()
if not pub:
# Scan extract offsets index for the first instance of
# any package file seen for each hash and extract the
# file as each is found.
hashes = set(hashes)
for name in self.__extract_offsets:
for fhash in hashes:
hash_fname = os.path.join("file",
fhash[:2], fhash)
if name.endswith(hash_fname):
self.extract_to(name, path,
filename=fhash)
hashes.discard(fhash)
break
if not hashes:
break
if hashes:
# Any remaining hashes are for package files
# that couldn't be found.
raise UnknownArchiveFiles(self.__arc_name,
hashes)
return
for fhash in hashes:
arcname = os.path.join("publisher", pub, "file",
fhash[:2], fhash)
self.extract_to(arcname, path, filename=fhash)
def extract_package_manifest(self, pfmri, path, filename=""):
"""Extract a package manifest from the archive.
'pfmri' is the FMRI string or object identifying the package
manifest to extract.
'path' is the absolute path of the directory to extract the
manifest to. It will be created automatically if it does not
exist.
'filename' is an optional name to use for the extracted file.
If not provided, the default behaviour is to create a directory
named after the package stem in 'path' and a file named after
the version in that directory; both components will be URI
encoded.
"""
assert not self.__closed and "r" in self.__mode
assert pfmri and path
if isinstance(pfmri, six.string_types):
pfmri = pkg.fmri.PkgFmri(pfmri)
assert pfmri.publisher
if not filename:
filename = pfmri.get_dir_path()
arcname = os.path.join("publisher", pfmri.publisher, "pkg",
pfmri.get_dir_path())
try:
self.extract_to(arcname, path, filename=filename)
except UnknownArchiveFiles:
raise UnknownPackageManifest(self.__arc_name, pfmri)
def extract_to(self, src, path, filename=""):
"""Extract a member from the archive.
'src' is the pathname of the archive file to extract.
'path' is the absolute path of the directory to extract the file
to.
'filename' is an optional string indicating the name to use for
the extracted file. If not provided, the full member name in
the archive will be used.
"""
assert not self.__closed and "r" in self.__mode
# Get the offset in the archive for the given file, and then
# seek to it.
offset = self.__extract_offsets.get(src, None)
tfile = self.__arc_tfile
if offset is not None:
# Prepare the tarfile object for extraction by telling
# it where to look for the file.
self.__arc_file.seek(offset)
tfile.offset = offset
# Get the tarinfo object needed to extract the file.
try:
member = tf.TarInfo.fromtarfile(tfile)
except tf.TarError:
# Read error encountered.
raise InvalidArchive(self.__arc_name)
except EnvironmentError as e:
raise apx._convert_error(e)
if member.name != src:
# Index must be invalid or tarfile has gone off
# the rails trying to read the archive.
raise InvalidArchive(self.__arc_name)
elif self.__extract_offsets:
# Assume there is no such archive member if extract
# offsets are known, but the item can't be found.
raise UnknownArchiveFiles(self.__arc_name, [src])
else:
# No archive index; fallback to retrieval by name.
member = src
# Extract the file to the specified location.
try:
self.__arc_tfile.extract_to(member, path=path,
filename=filename)
except KeyError:
raise UnknownArchiveFiles(self.__arc_name, [src])
except tf.TarError:
# Read error encountered.
raise InvalidArchive(self.__arc_name)
except EnvironmentError as e:
raise apx._convert_error(e)
if not isinstance(member, tf.TarInfo):
# Nothing more to do.
return
# If possible, validate the size of the extracted object.
try:
if not filename:
filename = member.name
dest = os.path.join(path, filename)
if os.stat(dest).st_size != member.size:
raise CorruptArchiveFiles(self.__arc_name,
[src])
except EnvironmentError as e:
raise apx._convert_error(e)
def get_file(self, src):
"""Returns an archive member as a file object. If the matching
member is a regular file, a file-like object will be returned.
If it is a link, a file-like object is constructed from the
link's target. In all other cases, None will be returned. The
file-like object is read-only and provides methods: read(),
readline(), readlines(), seek() and tell(). The returned object
must be closed before the archive is, and must not be used after
the archive is closed.
'src' is the pathname of the archive file to return.
"""
assert not self.__closed and "r" in self.__mode
# Get the offset in the archive for the given file, and then
# seek to it.
offset = self.__extract_offsets.get(src, None)
tfile = self.__arc_tfile
if offset is not None:
# Prepare the tarfile object for extraction by telling
# it where to look for the file.
self.__arc_file.seek(offset)
tfile.offset = offset
try:
# Get the tarinfo object needed to extract the
# file.
member = tf.TarInfo.fromtarfile(tfile)
except tf.TarError:
# Read error encountered.
raise InvalidArchive(self.__arc_name)
elif self.__extract_offsets:
# Assume there is no such archive member if extract
# offsets are known, but the item can't be found.
raise UnknownArchiveFiles(self.__arc_name, [src])
else:
# No archive index; fallback to retrieval by name.
member = src
# Finally, return the object for the matching archive member.
try:
return tfile.extractfile(member)
except KeyError:
raise UnknownArchiveFiles(self.__arc_name, [src])
def get_index(self):
"""Returns the index, and extract_offsets from an Archive
opened in read-only mode, allowing additional Archive objects
to reuse the index, in a memory-efficient manner."""
assert not self.__closed and "r" in self.__mode
if not self.__extract_offsets:
# If the extraction index doesn't exist, scan the
# complete archive and build one.
self.__find_extract_offsets()
return self.__extract_offsets
def get_package_file(self, fhash, pub=None):
"""Returns the first package file matching the given hash as a
file-like object. The file-like object is read-only and provides
methods: read(), readline(), readlines(), seek() and tell().
The returned object must be closed before the archive is, and
must not be used after the archive is closed.
'fhash' is the hash name of the file to return.
'pub' is the prefix (name) of the publisher that the package
files are associated with. If not provided, the first file
named after the given hash found in the archive will be used.
(This will be noticeably slower depending on the size of the
archive.)
"""
assert not self.__closed and "r" in self.__mode
if not self.__extract_offsets:
# If the extraction index doesn't exist, scan the
# complete archive and build one.
self.__find_extract_offsets()
if not pub:
# Scan extract offsets index for the first instance of
# any package file seen for the hash and extract it.
hash_fname = os.path.join("file", fhash[:2], fhash)
for name in self.__extract_offsets:
if name.endswith(hash_fname):
return self.get_file(name)
raise UnknownArchiveFiles(self.__arc_name, [fhash])
return self.get_file(os.path.join("publisher", pub, "file",
fhash[:2], fhash))
def get_package_manifest(self, pfmri, raw=False):
"""Returns a package manifest from the archive.
'pfmri' is the FMRI string or object identifying the package
manifest to extract.
        'raw' is an optional boolean indicating whether the raw content
        of the manifest should be returned.  If True, a file-like object
        containing the content of the manifest is returned; if False, a
        Manifest object is returned.
"""
assert not self.__closed and "r" in self.__mode
assert pfmri
if isinstance(pfmri, six.string_types):
pfmri = pkg.fmri.PkgFmri(pfmri)
assert pfmri.publisher
arcname = os.path.join("publisher", pfmri.publisher, "pkg",
pfmri.get_dir_path())
try:
fobj = self.get_file(arcname)
except UnknownArchiveFiles:
raise UnknownPackageManifest(self.__arc_name, pfmri)
if raw:
return fobj
m = pkg.manifest.Manifest(pfmri=pfmri)
m.set_content(content=force_str(fobj.read()), signatures=True)
return m
def get_publishers(self):
"""Return a list of publisher objects for all publishers used
in the archive."""
if self.__pubs:
return list(self.__pubs.values())
# If the extraction index doesn't exist, scan the complete
# archive and build one.
self.__find_extract_offsets()
# Search through offset index to find publishers
# in use.
self.__pubs = {}
for name in self.__extract_offsets:
if name.count("/") == 1 and \
name.startswith("publisher/"):
ignored, pfx = name.split("/", 1)
# See if this publisher has a .p5i file in the
# archive (needed for signed packages).
p5iname = os.path.join("publisher", pfx,
"pub.p5i")
try:
fobj = self.get_file(p5iname)
except UnknownArchiveFiles:
# No p5i; that's ok.
pub = pkg.client.publisher.Publisher(
pfx)
else:
pubs = pkg.p5i.parse(fileobj=fobj)
assert len(pubs) == 1
pub = pubs[0][0]
assert pub
self.__pubs[pfx] = pub
return list(self.__pubs.values())
def __cleanup(self):
"""Private helper method to cleanup temporary files."""
try:
if os.path.exists(self.__temp_dir):
shutil.rmtree(self.__temp_dir)
except EnvironmentError as e:
raise apx._convert_error(e)
def __close_fh(self):
"""Private helper method to close filehandles."""
# Some archives may not have an index.
if self.__index:
self.__index.close()
self.__index = None
# A read error during archive load may cause these to have
# never been set.
if self.__arc_tfile:
self.__arc_tfile.close()
self.__arc_tfile = None
if self.__arc_file:
self.__arc_file.close()
self.__arc_file = None
self.__closed = True
def close(self, progtrack=None):
"""If mode is 'r', this will close the archive file. If mode is
'w', this will write all queued files to the archive and close
it. Further operations on the archive are not possible after
calling this function."""
assert not self.__closed
if "w" not in self.__mode:
self.__close_fh()
self.__cleanup()
return
# Add the standard pkg5.repository file before closing the
# index.
fobj, fname = self.__mkstemp()
fobj.write("[CONFIGURATION]\nversion = 4\n\n"
"[publisher]\nprefix = {0}\n\n"
"[repository]\nversion = 4\n".format(self.__default_pub))
fobj.close()
self.add(fname, arcname="pkg5.repository")
# If any publisher objects were cached, then there were
# signed packages present, and p5i information for each
# must be added to the archive.
for pub in self.__pubs.values():
# A new publisher object is created with a copy of only
# the information that's needed for the archive.
npub = pkg.client.publisher.Publisher(pub.prefix,
alias=pub.alias,
revoked_ca_certs=pub.revoked_ca_certs,
approved_ca_certs=pub.approved_ca_certs)
# Create a p5i file.
fobj, fn = self.__mkstemp()
pkg.p5i.write(fobj, [npub])
fobj.close()
# Queue the p5i file for addition to the archive.
arcname = os.path.join("publisher", npub.prefix,
"pub.p5i")
self.add(fn, arcname=arcname)
# Close the index; no more entries can be added.
self.__index.close()
# If a tracker was provided, setup a progress goal.
idxbytes = 0
if progtrack:
nfiles = len(self.__queue)
nbytes = self.__queue_offset
try:
fs = os.stat(self.__index.pathname)
nfiles += 1
idxbytes = fs.st_size
nbytes += idxbytes
except EnvironmentError as e:
raise apx._convert_error(e)
progtrack.archive_set_goal(
os.path.basename(self.__arc_name), nfiles,
nbytes)
# Add the index file to the archive as the first file; it will
# automatically be marked with a comment identifying the index
# version.
tfile = self.__arc_tfile
tfile.add(self.__index.pathname, arcname=self.__idx_name)
if progtrack:
progtrack.archive_add_progress(1, idxbytes)
self.__index = None
# Add all queued files to the archive.
while self.__queue:
src, arcname = self.__queue.popleft()
start_offset = tfile.offset
tfile.add(src, arcname=arcname, recursive=False)
# tarfile caches member information for every item
# added by default, which provides fast access to the
# archive contents after generation, but isn't needed
# here (and uses a significant amount of memory).
# Plus popping it off the stack here allows use of
# the object's info to provide progress updates.
ti = tfile.members.pop()
if progtrack:
progtrack.archive_add_progress(1,
tfile.offset - start_offset)
ti.tarfile = None
del ti
# Cleanup temporary files.
self.__cleanup()
# Archive created; success!
if progtrack:
progtrack.archive_done()
self.__close_fh()
@property
def pathname(self):
"""The absolute path of the archive file."""
return self.__arc_name