385 lines
17 KiB
385 lines
17 KiB
#!/usr/bin/python -u
|
|
#
|
|
# p7zr library
|
|
#
|
|
# Copyright (c) 2019 Hiroshi Miura <miurahr@linux.com>
|
|
# Copyright (c) 2004-2015 by Joachim Bauch, mail@joachim-bauch.de
|
|
# 7-Zip Copyright (C) 1999-2010 Igor Pavlov
|
|
# LZMA SDK Copyright (C) 1999-2010 Igor Pavlov
|
|
#
|
|
# This library is free software; you can redistribute it and/or
|
|
# modify it under the terms of the GNU Lesser General Public
|
|
# License as published by the Free Software Foundation; either
|
|
# version 2.1 of the License, or (at your option) any later version.
|
|
#
|
|
# This library is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
# Lesser General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU Lesser General Public
|
|
# License along with this library; if not, write to the Free Software
|
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
#
|
|
import bz2
|
|
import io
|
|
import lzma
|
|
import os
|
|
import queue
|
|
import sys
|
|
import threading
|
|
from typing import IO, Any, BinaryIO, Dict, List, Optional, Union
|
|
|
|
from py7zr import UnsupportedCompressionMethodError
|
|
from py7zr.extra import CopyDecompressor, DeflateDecompressor, ISevenZipDecompressor, ZstdDecompressor
|
|
from py7zr.helpers import MemIO, NullIO, calculate_crc32, readlink
|
|
from py7zr.properties import READ_BLOCKSIZE, ArchivePassword, CompressionMethod
|
|
|
|
if sys.version_info < (3, 6):
|
|
import pathlib2 as pathlib
|
|
else:
|
|
import pathlib
|
|
try:
|
|
import zstandard as Zstd # type: ignore
|
|
except ImportError:
|
|
Zstd = None
|
|
|
|
|
|
class Worker:
    """Extract worker class to invoke handler.

    A worker is bound to a list of archive file entries, the offset in the
    archive where packed data starts and the parsed archive header.
    Destinations for extraction are registered per file id with
    :meth:`register_filelike`; entries without a registered destination are
    still decompressed (into a ``NullIO`` sink) but their data is discarded.
    """

    def __init__(self, files, src_start: int, header) -> None:
        # Destination per file id, filled through register_filelike().
        self.target_filepath = {}  # type: Dict[int, Union[MemIO, pathlib.Path, None]]
        self.files = files
        self.src_start = src_start
        self.header = header

    def extract(self, fp: BinaryIO, parallel: bool, q=None) -> None:
        """Extract worker method to handle 7zip folder and decompress each files.

        :parameter fp: opened archive file object.
        :parameter parallel: when true and the archive has more than one folder,
            every folder is extracted in its own thread. Each thread re-opens the
            archive by ``fp.name``; when ``fp`` has no string ``name`` (for example
            an in-memory stream) extraction falls back to sequential mode.
        :parameter q: optional queue that receives start/end progress tuples.
        """
        main_streams = getattr(self.header, 'main_streams', None)
        if main_streams is None:
            # No packed streams at all: only empty entries can be produced.
            empty_files = [f for f in self.files if f.emptystream]
            self.extract_single(fp, empty_files, 0, 0, q)
            return

        src_end = self.src_start + main_streams.packinfo.packpositions[-1]
        numfolders = main_streams.unpackinfo.numfolders
        if numfolders == 1:
            self.extract_single(fp, self.files, self.src_start, src_end, q)
            return

        folders = main_streams.unpackinfo.folders
        positions = main_streams.packinfo.packpositions
        empty_files = [f for f in self.files if f.emptystream]
        filename = getattr(fp, 'name', None)
        if not parallel or not isinstance(filename, str):
            # Sequential mode; also used when the archive cannot be re-opened by name.
            self.extract_single(fp, empty_files, 0, 0, q)
            for i in range(numfolders):
                self.extract_single(fp, folders[i].files, self.src_start + positions[i],
                                    self.src_start + positions[i + 1], q)
            return

        # Passing the path lets extract_single() open *and close* its own handle.
        self.extract_single(filename, empty_files, 0, 0, q)
        extract_threads = []
        for i in range(numfolders):
            p = threading.Thread(target=self.extract_single,
                                 args=(filename, folders[i].files,
                                       self.src_start + positions[i], self.src_start + positions[i + 1], q))
            p.start()
            extract_threads.append(p)
        for p in extract_threads:
            p.join()

    def extract_single(self, fp: Union[BinaryIO, str], files, src_start: int, src_end: int,
                       q: Optional[queue.Queue]) -> None:
        """Single thread extractor that takes file lists in single 7zip folder.

        :parameter fp: archive file object, or a path string. A path is opened
            here in binary mode and closed again before returning.
        :parameter files: entries to extract; ``None`` means nothing to do.
        :parameter src_start: offset to seek to before reading.
        :parameter src_end: end position of the folder data.
        :parameter q: optional queue that receives ``('s', name, compressed)``
            before and ``('e', name, uncompressed)`` after each entry.
        """
        if files is None:
            return
        if isinstance(fp, str):
            # We own this handle, so make sure it is released even on error.
            with open(fp, 'rb') as own_fp:
                self._extract_files(own_fp, files, src_start, src_end, q)
        else:
            self._extract_files(fp, files, src_start, src_end, q)

    def _extract_files(self, fp: BinaryIO, files, src_start: int, src_end: int,
                       q: Optional[queue.Queue]) -> None:
        """Seek to src_start and write every entry of files to its registered target."""
        fp.seek(src_start)
        for f in files:
            if q is not None:
                q.put(('s', str(f.filename), str(f.compressed) if f.compressed is not None else '0'))
            fileish = self.target_filepath.get(f.id, None)
            if fileish is not None:
                fileish.parent.mkdir(parents=True, exist_ok=True)
                with fileish.open(mode='wb') as ofp:
                    # For an empty stream, opening the target is enough to create it.
                    if not f.emptystream:
                        self.decompress(fp, f.folder, ofp, f.uncompressed[-1], f.compressed, src_end)
                        ofp.seek(0)
            elif not f.emptystream:
                # No target registered: read and discard the data (the CRC check still runs).
                with NullIO() as ofp:
                    self.decompress(fp, f.folder, ofp, f.uncompressed[-1], f.compressed, src_end)
            if q is not None:
                q.put(('e', str(f.filename), str(f.uncompressed[-1])))

    def decompress(self, fp: BinaryIO, folder, fq: IO[Any],
                   size: int, compressed_size: Optional[int], src_end: int) -> None:
        """decompressor wrapper called from extract method.

        :parameter fp: archive source file pointer
        :parameter folder: Folder object that have decompressor object.
        :parameter fq: writable output file object
        :parameter size: uncompressed size of target file.
        :parameter compressed_size: compressed size of target file.
        :parameter src_end: end position of the folder
        :returns None
        :raises Exception: when the source is exhausted and the decompressor
            yields no more data although output is still expected.
        """
        assert folder is not None
        out_remaining = size
        decompressor = folder.get_decompressor(compressed_size)
        while out_remaining > 0:
            max_length = min(out_remaining, io.DEFAULT_BUFFER_SIZE)
            rest_size = src_end - fp.tell()
            read_size = min(READ_BLOCKSIZE, rest_size)
            if read_size == 0:
                # Source exhausted: drain what the decompressor still buffers.
                tmp = decompressor.decompress(b'', max_length)
                if len(tmp) == 0:
                    raise Exception("decompression get wrong: no output data.")
            else:
                inp = fp.read(read_size)
                tmp = decompressor.decompress(inp, max_length)
            if len(tmp) > 0 and out_remaining >= len(tmp):
                out_remaining -= len(tmp)
                fq.write(tmp)
            if out_remaining <= 0:
                break
        # The digest is only compared once the whole folder has been read.
        if fp.tell() >= src_end:
            if decompressor.crc is not None and not decompressor.check_crc():
                print('\nCRC error! expected: {}, real: {}'.format(decompressor.crc, decompressor.digest))

    def _find_link_target(self, target):
        """Find the target member of a symlink or hardlink member in the archive.

        Returns the link target relative to the directory of ``target`` when the
        target is itself one of this worker's files, otherwise the normalized
        link target as read from the file system.
        """
        targetname = target.as_posix()  # type: str
        linkname = readlink(targetname)
        # Check windows full path symlinks
        if linkname.startswith("\\\\?\\"):
            linkname = linkname[4:]
        # normalize as posix style
        linkname = pathlib.Path(linkname).as_posix()  # type: str
        if any(linkname == f.origin.as_posix() for f in self.files):
            # FIXME: when API user specify arcname, it will break
            return os.path.relpath(linkname, os.path.dirname(targetname))
        return linkname

    def archive(self, fp: BinaryIO, folder, deref=False):
        """Run archive task for specified 7zip folder.

        Compresses every non-empty entry of ``self.files`` into ``fp`` with the
        folder's compressor and records digests and sizes in ``self.header``.

        :parameter fp: writable archive file object.
        :parameter folder: folder object providing ``get_compressor()``.
        :parameter deref: when false, symlinks are stored as their link target
            text instead of the content they point to.
        """
        compressor = folder.get_compressor()
        outsize = 0
        self.header.main_streams.packinfo.numstreams = 1
        num_unpack_streams = 0
        self.header.main_streams.substreamsinfo.digests = []
        self.header.main_streams.substreamsinfo.digestsdefined = []
        last_file_index = 0
        foutsize = 0
        for i, f in enumerate(self.files):
            file_info = f.file_properties()
            self.header.files_info.files.append(file_info)
            self.header.files_info.emptyfiles.append(f.emptystream)
            foutsize = 0
            if f.is_symlink and not deref:
                last_file_index = i
                num_unpack_streams += 1
                link_target = self._find_link_target(f.origin)  # type: str
                tgt = link_target.encode('utf-8')  # type: bytes
                insize = len(tgt)
                crc = calculate_crc32(tgt, 0)  # type: int
                out = compressor.compress(tgt)
                outsize += len(out)
                foutsize += len(out)
                fp.write(out)
                self.header.main_streams.substreamsinfo.digests.append(crc)
                self.header.main_streams.substreamsinfo.digestsdefined.append(True)
                self.header.main_streams.substreamsinfo.unpacksizes.append(insize)
                self.header.files_info.files[i]['maxsize'] = foutsize
            elif not f.emptystream:
                last_file_index = i
                num_unpack_streams += 1
                insize = 0
                with f.origin.open(mode='rb') as fd:
                    data = fd.read(READ_BLOCKSIZE)
                    insize += len(data)
                    crc = 0
                    while data:
                        crc = calculate_crc32(data, crc)
                        out = compressor.compress(data)
                        outsize += len(out)
                        foutsize += len(out)
                        fp.write(out)
                        data = fd.read(READ_BLOCKSIZE)
                        insize += len(data)
                    self.header.main_streams.substreamsinfo.digests.append(crc)
                    self.header.main_streams.substreamsinfo.digestsdefined.append(True)
                    self.header.files_info.files[i]['maxsize'] = foutsize
                self.header.main_streams.substreamsinfo.unpacksizes.append(insize)
        # All entries are fed; flush what the compressor still buffers. The loop has
        # no ``break``, so this always runs exactly once after it.
        out = compressor.flush()
        outsize += len(out)
        foutsize += len(out)
        fp.write(out)
        if len(self.files) > 0:
            self.header.files_info.files[last_file_index]['maxsize'] = foutsize
        # Update size data in header
        self.header.main_streams.packinfo.packsizes = [outsize]
        folder.unpacksizes = [sum(self.header.main_streams.substreamsinfo.unpacksizes)]
        self.header.main_streams.substreamsinfo.num_unpackstreams_folders = [num_unpack_streams]

    # The annotation is quoted (a forward reference) so it is not evaluated at definition time.
    def register_filelike(self, id: int, fileish: 'Union[MemIO, pathlib.Path, None]') -> None:
        """register file-ish to worker."""
        self.target_filepath[id] = fileish
|
|
|
|
|
|
class SevenZipDecompressor:
    """Main decompressor object which is properly configured and bind to each 7zip folder.

    A 7zip folder can carry its own compression method, so one instance is
    built per folder from that folder's coder list. Coder chains made only of
    methods known to the ``lzma`` module are decoded with a raw
    ``lzma.LZMADecompressor``; otherwise a dedicated decompressor is chosen
    from the first coder.
    """

    # 7zip method id -> filter id understood by the lzma module.
    lzma_methods_map = {
        CompressionMethod.LZMA: lzma.FILTER_LZMA1,
        CompressionMethod.LZMA2: lzma.FILTER_LZMA2,
        CompressionMethod.DELTA: lzma.FILTER_DELTA,
        CompressionMethod.P7Z_BCJ: lzma.FILTER_X86,
        CompressionMethod.BCJ_ARM: lzma.FILTER_ARM,
        CompressionMethod.BCJ_ARMT: lzma.FILTER_ARMTHUMB,
        CompressionMethod.BCJ_IA64: lzma.FILTER_IA64,
        CompressionMethod.BCJ_PPC: lzma.FILTER_POWERPC,
        CompressionMethod.BCJ_SPARC: lzma.FILTER_SPARC,
    }

    # Private filter ids for the methods that lzma cannot handle.
    FILTER_BZIP2 = 0x31
    FILTER_ZIP = 0x32
    FILTER_COPY = 0x33
    FILTER_AES = 0x34
    FILTER_ZSTD = 0x35
    alt_methods_map = {
        CompressionMethod.MISC_BZIP2: FILTER_BZIP2,
        CompressionMethod.MISC_DEFLATE: FILTER_ZIP,
        CompressionMethod.COPY: FILTER_COPY,
        CompressionMethod.CRYPT_AES256_SHA256: FILTER_AES,
        CompressionMethod.MISC_ZSTD: FILTER_ZSTD,
    }

    def __init__(self, coders: List[Dict[str, Any]], size: int, crc: Optional[int]) -> None:
        """Configure a decompressor for the given coder chain.

        :parameter coders: coder descriptions of the folder.
        :parameter size: stored as ``input_size``.
        :parameter crc: expected CRC of the decompressed data, or None to skip digesting.
        :raises UnsupportedCompressionMethodError: when no decompressor fits the coders.
        """
        self.input_size = size
        self.consumed = 0  # type: int
        self.crc = crc
        self.digest = None  # type: Optional[int]
        configure = self._set_lzma_decompressor if self._check_lzma_coders(coders) \
            else self._set_alternative_decompressor
        configure(coders)

    def _check_lzma_coders(self, coders: List[Dict[str, Any]]) -> bool:
        """Tell whether every coder uses a method listed in ``lzma_methods_map``."""
        return all(self.lzma_methods_map.get(coder['method'], None) is not None for coder in coders)

    def _set_lzma_decompressor(self, coders: List[Dict[str, Any]]) -> None:
        """Build a raw LZMA decompressor whose filter chain is the coders in reverse order."""
        chain = []  # type: List[Dict[str, Any]]
        for coder in coders:
            if not (coder['numinstreams'] == 1 and coder['numoutstreams'] == 1):
                raise UnsupportedCompressionMethodError('Only a simple compression method is currently supported.')
            filter_id = self.lzma_methods_map.get(coder['method'], None)
            if filter_id is None:
                raise UnsupportedCompressionMethodError
            properties = coder.get('properties', None)
            if properties is None:
                spec = {'id': filter_id}
            else:
                spec = lzma._decode_filter_properties(filter_id, properties)  # type: ignore
            # Each coder goes in front of the ones seen before it.
            chain.insert(0, spec)
        self.decompressor = lzma.LZMADecompressor(format=lzma.FORMAT_RAW, filters=chain)  # type: Union[bz2.BZ2Decompressor, lzma.LZMADecompressor, ISevenZipDecompressor]  # noqa

    def _set_alternative_decompressor(self, coders: List[Dict[str, Any]]) -> None:
        """Pick a non-LZMA decompressor from the method of the first coder.

        NOTE(review): only ``coders[0]`` is consulted here; any further coder in
        the chain is ignored - confirm that is intended.
        """
        filter_id = self.alt_methods_map.get(coders[0]['method'], None)
        candidates = (
            (self.FILTER_BZIP2, True, bz2.BZ2Decompressor),
            (self.FILTER_ZIP, True, DeflateDecompressor),
            (self.FILTER_COPY, True, CopyDecompressor),
            # Zstandard is only usable when the optional module was importable.
            (self.FILTER_ZSTD, Zstd, ZstdDecompressor),
        )
        for candidate_id, available, factory in candidates:
            if filter_id == candidate_id and available:
                self.decompressor = factory()
                return
        raise UnsupportedCompressionMethodError

    def decompress(self, data: bytes, max_length: Optional[int] = None) -> bytes:
        """Decompress ``data`` and fold the output into the running digest.

        :parameter data: next chunk of compressed input (may be empty).
        :parameter max_length: forwarded as ``max_length`` to the underlying
            decompressor when it is not None.
        :returns: the decompressed bytes produced by this call.
        """
        self.consumed += len(data)
        options = {} if max_length is None else {'max_length': max_length}
        folder_data = self.decompressor.decompress(data, **options)
        # The CRC is computed over the uncompressed data, and only if one is expected.
        if self.crc is not None:
            self.digest = calculate_crc32(folder_data, self.digest)
        return folder_data

    def check_crc(self):
        """Return True when the running digest equals the expected CRC."""
        return self.crc == self.digest
|
|
|
|
|
|
class SevenZipCompressor:
    """Main compressor object to configured for each 7zip folder.

    Wraps a raw ``lzma.LZMACompressor`` and keeps, in ``coders``, the coder
    description matching each filter of the chain.
    """

    __slots__ = ['filters', 'compressor', 'coders']

    # lzma filter id -> 7zip method id, for the filters that can be written.
    lzma_methods_map_r = {
        lzma.FILTER_LZMA2: CompressionMethod.LZMA2,
        lzma.FILTER_DELTA: CompressionMethod.DELTA,
        lzma.FILTER_X86: CompressionMethod.P7Z_BCJ,
    }

    def __init__(self, filters=None):
        """Create the compressor.

        :parameter filters: lzma filter chain specification; when None a single
            LZMA2 filter with preset ``7 | lzma.PRESET_EXTREME`` is used.
        """
        self.filters = filters if filters is not None \
            else [{"id": lzma.FILTER_LZMA2, "preset": 7 | lzma.PRESET_EXTREME}, ]
        self.compressor = lzma.LZMACompressor(format=lzma.FORMAT_RAW, filters=self.filters)
        self.coders = []
        for spec in self.filters:
            if spec is None:
                # A None entry ends the chain.
                break
            self.coders.append({
                'method': self.lzma_methods_map_r[spec['id']],
                'properties': lzma._encode_filter_properties(spec),
                'numinstreams': 1,
                'numoutstreams': 1,
            })

    def compress(self, data):
        """Feed ``data`` to the compressor and return the output available so far."""
        compressed = self.compressor.compress(data)
        return compressed

    def flush(self):
        """Finish compression and return the remaining output."""
        remaining = self.compressor.flush()
        return remaining
|
|
|
|
|
|
def get_methods_names(coders: List[dict]) -> List[str]:
    """Return human readable method names for specified coders.

    Besides the LZMA family and 7zAES, the table covers the Copy, BZip2,
    Deflate and ZSTD methods, which ``SevenZipDecompressor.alt_methods_map``
    also knows, so such archives can be described as well.

    :parameter coders: coder descriptions, each with a ``'method'`` entry.
    :returns: one name per coder, in the order given.
    :raises UnsupportedCompressionMethodError: when a method has no known name.
    """
    methods_name_map = {
        CompressionMethod.LZMA2: "LZMA2",
        CompressionMethod.LZMA: "LZMA",
        CompressionMethod.DELTA: "delta",
        CompressionMethod.P7Z_BCJ: "BCJ",
        CompressionMethod.BCJ_ARM: "BCJ(ARM)",
        CompressionMethod.BCJ_ARMT: "BCJ(ARMT)",
        CompressionMethod.BCJ_IA64: "BCJ(IA64)",
        CompressionMethod.BCJ_PPC: "BCJ(POWERPC)",
        CompressionMethod.BCJ_SPARC: "BCJ(SPARC)",
        CompressionMethod.CRYPT_AES256_SHA256: "7zAES",
        CompressionMethod.COPY: "Copy",
        CompressionMethod.MISC_BZIP2: "BZip2",
        CompressionMethod.MISC_DEFLATE: "Deflate",
        CompressionMethod.MISC_ZSTD: "ZSTD",
    }
    methods_names = []  # type: List[str]
    for coder in coders:
        # Read the method once, outside the try, so a coder without a 'method'
        # entry surfaces as a plain KeyError instead of failing inside the handler.
        method = coder['method']
        try:
            methods_names.append(methods_name_map[method])
        except KeyError:
            raise UnsupportedCompressionMethodError("Unknown method {}".format(method))
    return methods_names
|