bazarr/libs/gitdb/db/pack.py

# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Module containing a database to deal with packs"""
from gitdb.db.base import (
    FileDBBase,
    ObjectDBR,
    CachingDB
)

from gitdb.util import LazyMixin

from gitdb.exc import (
    BadObject,
    UnsupportedOperation,
    AmbiguousObjectName
)

from gitdb.pack import PackEntity
from gitdb.utils.compat import xrange

from functools import reduce

import os
import glob

__all__ = ('PackedDB', )

#{ Utilities


class PackedDB(FileDBBase, ObjectDBR, CachingDB, LazyMixin):

    """A database operating on a set of object packs

    Every known pack is tracked in ``self._entities`` as a mutable list of
    three items:

    * hits - priority counter, seeded with the pack size and incremented each
      time the pack answers a lookup
    * entity - the PackEntity instance
    * sha_to_index - the ``sha_to_index`` method of the entity's index, kept
      at hand for direct queries

    The list is kept sorted by descending hit counter so that packs which are
    most likely to contain an object are queried first."""

    # sort the priority list every N queries
    # Higher values are better, performance tests don't show this has
    # any effect, but it should have one
    _sort_interval = 500

    def __init__(self, root_path):
        """Initialize the database without reading any pack yet

        :param root_path: path handed on to the base class. It is the directory
            that :meth:`update_cache` searches for ``pack-*.pack`` files"""
        super(PackedDB, self).__init__(root_path)
        # NOTE: self._entities is intentionally not assigned here, it is
        # created on demand by _set_cache_ (see the class docstring for its layout)
        self._hit_count = 0             # amount of hits
        self._st_mtime = 0              # last modification date of our root path

    def _set_cache_(self, attr):
        """Create the ``_entities`` list and fill it from disk

        :param attr: name of the attribute to initialize, only '_entities'
            is handled, any other name is ignored"""
        if attr == '_entities':
            self._entities = list()
            self.update_cache(force=True)
        # END handle entities initialization

    def _sort_entities(self):
        """Sort the entity list in place, highest hit counter first"""
        self._entities.sort(key=lambda item: item[0], reverse=True)

    def _pack_info(self, sha):
        """:return: tuple(entity, index) for an item at the given sha
        :param sha: 20 or 40 byte sha
        :raise BadObject: if none of the known packs contains the sha
        **Note:** This method is not thread-safe, but may be hit in multi-threaded
            operation. The worst thing that can happen though is a counter that
            was not incremented, or the list being in wrong order. So we save
            the time for locking here, lets see how that goes"""
        # presort ?
        # NOTE: the counter only advances on hits, hence the sort also runs on
        # every call made while the counter rests on a multiple of the interval
        if self._hit_count % self._sort_interval == 0:
            self._sort_entities()
        # END update sorting

        for item in self._entities:
            index = item[2](sha)
            if index is not None:
                item[0] += 1            # one hit for you
                self._hit_count += 1    # general hit count
                return (item[1], index)
            # END index found in pack
        # END for each item

        # no hit, see whether we have to update packs
        # NOTE: considering packs don't change very often, we save this call
        # and leave it to the super-caller to trigger that
        raise BadObject(sha)

    #{ Object DB Read

    def has_object(self, sha):
        """:return: True if one of the known packs contains the given sha"""
        try:
            self._pack_info(sha)
        except BadObject:
            return False
        # END exception handling
        return True

    def info(self, sha):
        """:return: result of the owning entity's ``info_at_index`` for the given sha
        :raise BadObject: if the sha is not contained in any known pack"""
        entity, index = self._pack_info(sha)
        return entity.info_at_index(index)

    def stream(self, sha):
        """:return: result of the owning entity's ``stream_at_index`` for the given sha
        :raise BadObject: if the sha is not contained in any known pack"""
        entity, index = self._pack_info(sha)
        return entity.stream_at_index(index)

    def sha_iter(self):
        """:return: iterator yielding the sha at every index position of every
            known pack, one pack after another"""
        for entity in self.entities():
            index = entity.index()
            sha_by_index = index.sha
            # use a dedicated name for the position so the index object
            # is not rebound while we iterate it
            for position in xrange(index.size()):
                yield sha_by_index(position)
            # END for each index
        # END for each entity

    def size(self):
        """:return: sum of the index sizes of all known packs, 0 if there are none"""
        return sum(item[1].index().size() for item in self._entities)

    #} END object db read

    #{ object db write

    def store(self, istream):
        """Storing individual objects is not feasible as a pack is designed to
        hold multiple objects. Writing or rewriting packs for single objects is
        inefficient

        :raise UnsupportedOperation: always"""
        raise UnsupportedOperation()

    #} END object db write

    #{ Interface

    def update_cache(self, force=False):
        """
        Update our cache with the actually existing packs on disk. Add new ones,
        and remove deleted ones. We keep the unchanged ones

        :param force: If True, the cache will be updated even though the directory
            does not appear to have changed according to its modification timestamp.
        :return: True if the packs have been updated so there is new information,
            False if there was no change to the pack database"""
        root = self.root_path()
        stat = os.stat(root)
        if not force and stat.st_mtime <= self._st_mtime:
            return False
        # END abort early on no change
        self._st_mtime = stat.st_mtime

        # packs are supposed to be prefixed with pack- by git-convention
        # get all pack files, figure out what changed
        pack_files = set(glob.glob(os.path.join(root, "pack-*.pack")))
        our_pack_files = set(item[1].pack().path() for item in self._entities)

        # new packs
        for pack_file in (pack_files - our_pack_files):
            # init the hit-counter/priority with the size, a good measure for hit-
            # probability. Its implemented so that only 12 bytes will be read
            entity = PackEntity(pack_file)
            self._entities.append([entity.pack().size(), entity, entity.index().sha_to_index])
        # END for each new packfile

        # removed packs
        # Filter in place so the list object stays the same. This does not rely
        # on an assert, which would vanish when running with -O
        removed_pack_files = our_pack_files - pack_files
        if removed_pack_files:
            self._entities[:] = [item for item in self._entities
                                 if item[1].pack().path() not in removed_pack_files]
        # END handle removed packs

        # reinitialize priorities
        self._sort_entities()
        return True

    def entities(self):
        """:return: list of pack entities operated upon by this database"""
        return [item[1] for item in self._entities]

    def partial_to_complete_sha(self, partial_binsha, canonical_length):
        """:return: 20 byte sha as inferred by the given partial binary sha
        :param partial_binsha: binary sha with less than 20 bytes
        :param canonical_length: length of the corresponding canonical representation.
            It is required as binary sha's cannot display whether the original hex sha
            had an odd or even number of characters
        :raise AmbiguousObjectName: if the packs resolve the partial sha to
            different full shas
        :raise BadObject: if no pack knows a sha matching the partial one"""
        candidate = None
        for item in self._entities:
            index = item[1].index()
            item_index = index.partial_sha_to_index(partial_binsha, canonical_length)
            if item_index is None:
                continue
            # END skip packs without a match

            sha = index.sha(item_index)
            # the same object may live in several packs, only differing
            # shas make the name ambiguous
            if candidate is not None and candidate != sha:
                raise AmbiguousObjectName(partial_binsha)
            candidate = sha
        # END for each entity

        if candidate is not None:
            return candidate

        # still not found ?
        raise BadObject(partial_binsha)

    #} END interface
Include dependencies and remove requirements.txt 6 years ago			`# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors`
			`#`
			`# This module is part of GitDB and is released under`
			`# the New BSD License: http://www.opensource.org/licenses/bsd-license.php`
			`"""Module containing a database to deal with packs"""`
			`from gitdb.db.base import (`
			`FileDBBase,`
			`ObjectDBR,`
			`CachingDB`
			`)`

			`from gitdb.util import LazyMixin`

			`from gitdb.exc import (`
			`BadObject,`
			`UnsupportedOperation,`
			`AmbiguousObjectName`
			`)`

			`from gitdb.pack import PackEntity`
			`from gitdb.utils.compat import xrange`

			`from functools import reduce`

			`import os`
			`import glob`

			`__all__ = ('PackedDB', )`

			`#{ Utilities`


			`class PackedDB(FileDBBase, ObjectDBR, CachingDB, LazyMixin):`

			`"""A database operating on a set of object packs"""`

			`# sort the priority list every N queries`
			`# Higher values are better, performance tests don't show this has`
			`# any effect, but it should have one`
			`_sort_interval = 500`

			`def __init__(self, root_path):`
			`super(PackedDB, self).__init__(root_path)`
			`# list of lists with three items:`
			`# * hits - number of times the pack was hit with a request`
			`# * entity - Pack entity instance`
			`# * sha_to_index - PackIndexFile.sha_to_index method for direct cache query`
			`# self._entities = list() # lazy loaded list`
			`self._hit_count = 0 # amount of hits`
			`self._st_mtime = 0 # last modification data of our root path`

			`def _set_cache_(self, attr):`
			`if attr == '_entities':`
			`self._entities = list()`
			`self.update_cache(force=True)`
			`# END handle entities initialization`

			`def _sort_entities(self):`
			`self._entities.sort(key=lambda l: l[0], reverse=True)`

			`def _pack_info(self, sha):`
			`""":return: tuple(entity, index) for an item at the given sha`
			`:param sha: 20 or 40 byte sha`
			`:raise BadObject:`
			`Note: This method is not thread-safe, but may be hit in multi-threaded`
			`operation. The worst thing that can happen though is a counter that`
			`was not incremented, or the list being in wrong order. So we safe`
			`the time for locking here, lets see how that goes"""`
			`# presort ?`
			`if self._hit_count % self._sort_interval == 0:`
			`self._sort_entities()`
			`# END update sorting`

			`for item in self._entities:`
			`index = item[2](sha)`
			`if index is not None:`
			`item[0] += 1 # one hit for you`
			`self._hit_count += 1 # general hit count`
			`return (item[1], index)`
			`# END index found in pack`
			`# END for each item`

			`# no hit, see whether we have to update packs`
			`# NOTE: considering packs don't change very often, we safe this call`
			`# and leave it to the super-caller to trigger that`
			`raise BadObject(sha)`

			`#{ Object DB Read`

			`def has_object(self, sha):`
			`try:`
			`self._pack_info(sha)`
			`return True`
			`except BadObject:`
			`return False`
			`# END exception handling`

			`def info(self, sha):`
			`entity, index = self._pack_info(sha)`
			`return entity.info_at_index(index)`

			`def stream(self, sha):`
			`entity, index = self._pack_info(sha)`
			`return entity.stream_at_index(index)`

			`def sha_iter(self):`
			`for entity in self.entities():`
			`index = entity.index()`
			`sha_by_index = index.sha`
			`for index in xrange(index.size()):`
			`yield sha_by_index(index)`
			`# END for each index`
			`# END for each entity`

			`def size(self):`
			`sizes = [item[1].index().size() for item in self._entities]`
			`return reduce(lambda x, y: x + y, sizes, 0)`

			`#} END object db read`

			`#{ object db write`

			`def store(self, istream):`
			`"""Storing individual objects is not feasible as a pack is designed to`
			`hold multiple objects. Writing or rewriting packs for single objects is`
			`inefficient"""`
			`raise UnsupportedOperation()`

			`#} END object db write`

			`#{ Interface`

			`def update_cache(self, force=False):`
			`"""`
			`Update our cache with the actually existing packs on disk. Add new ones,`
			`and remove deleted ones. We keep the unchanged ones`

			`:param force: If True, the cache will be updated even though the directory`
			`does not appear to have changed according to its modification timestamp.`
			`:return: True if the packs have been updated so there is new information,`
			`False if there was no change to the pack database"""`
			`stat = os.stat(self.root_path())`
			`if not force and stat.st_mtime <= self._st_mtime:`
			`return False`
			`# END abort early on no change`
			`self._st_mtime = stat.st_mtime`

			`# packs are supposed to be prefixed with pack- by git-convention`
			`# get all pack files, figure out what changed`
			`pack_files = set(glob.glob(os.path.join(self.root_path(), "pack-*.pack")))`
			`our_pack_files = set(item[1].pack().path() for item in self._entities)`

			`# new packs`
			`for pack_file in (pack_files - our_pack_files):`
			`# init the hit-counter/priority with the size, a good measure for hit-`
			`# probability. Its implemented so that only 12 bytes will be read`
			`entity = PackEntity(pack_file)`
			`self._entities.append([entity.pack().size(), entity, entity.index().sha_to_index])`
			`# END for each new packfile`

			`# removed packs`
			`for pack_file in (our_pack_files - pack_files):`
			`del_index = -1`
			`for i, item in enumerate(self._entities):`
			`if item[1].pack().path() == pack_file:`
			`del_index = i`
			`break`
			`# END found index`
			`# END for each entity`
			`assert del_index != -1`
			`del(self._entities[del_index])`
			`# END for each removed pack`

			`# reinitialize priorities`
			`self._sort_entities()`
			`return True`

			`def entities(self):`
			`""":return: list of pack entities operated upon by this database"""`
			`return [item[1] for item in self._entities]`

			`def partial_to_complete_sha(self, partial_binsha, canonical_length):`
			`""":return: 20 byte sha as inferred by the given partial binary sha`
			`:param partial_binsha: binary sha with less than 20 bytes`
			`:param canonical_length: length of the corresponding canonical representation.`
			`It is required as binary sha's cannot display whether the original hex sha`
			`had an odd or even number of characters`
			`:raise AmbiguousObjectName:`
			`:raise BadObject: """`
			`candidate = None`
			`for item in self._entities:`
			`item_index = item[1].index().partial_sha_to_index(partial_binsha, canonical_length)`
			`if item_index is not None:`
			`sha = item[1].index().sha(item_index)`
			`if candidate and candidate != sha:`
			`raise AmbiguousObjectName(partial_binsha)`
			`candidate = sha`
			`# END handle full sha could be found`
			`# END for each entity`

			`if candidate:`
			`return candidate`

			`# still not found ?`
			`raise BadObject(partial_binsha)`

			`#} END interface`