313 lines
10 KiB
313 lines
10 KiB
# -*- coding: utf-8 -*-
|
|
import os
|
|
import sys
|
|
import codecs
|
|
|
|
try:
|
|
from collections import UserList
|
|
except ImportError:
|
|
from UserList import UserList
|
|
|
|
from itertools import chain
|
|
from copy import copy
|
|
|
|
from pysrt.srtexc import Error
|
|
from pysrt.srtitem import SubRipItem
|
|
from pysrt.compat import str
|
|
|
|
# Byte order marks paired with the codec each one announces. The 4-byte
# UTF-32 marks are listed before the UTF-16 ones because the UTF-32 LE mark
# starts with the same two bytes as the UTF-16 LE mark, and lookups walk this
# tuple in order.
BOMS = (
    (codecs.BOM_UTF32_LE, 'utf_32_le'),
    (codecs.BOM_UTF32_BE, 'utf_32_be'),
    (codecs.BOM_UTF16_LE, 'utf_16_le'),
    (codecs.BOM_UTF16_BE, 'utf_16_be'),
    (codecs.BOM_UTF8, 'utf_8'),
)

# Codec name -> the BOM decoded with that codec, i.e. the text a reader using
# that codec yields for the mark.
CODECS_BOMS = dict((codec, bom.decode(codec)) for bom, codec in BOMS)

# Length in bytes of the longest known BOM.
BIGGER_BOM = max(len(bom) for bom, _ in BOMS)
|
|
|
|
|
|
class SubRipFile(UserList, object):
    """
    SubRip file descriptor.

    Provide a pure Python mapping on all metadata.

    SubRipFile(items, eol, path, encoding)

    items -> list of SubRipItem. Default to [].
    eol -> str: end of line character. Default to linesep used in opened file
        if any else to os.linesep.
    path -> str: path where file will be saved. To open an existing file see
        SubRipFile.open.
    encoding -> str: encoding used at file save. Default to utf-8.
    """
    # Values accepted by the `error_handling` arguments: ignore the parsing
    # error, report it on stderr, or re-raise it (see `_handle_error`).
    ERROR_PASS = 0
    ERROR_LOG = 1
    ERROR_RAISE = 2

    # Encoding assumed by `_detect_encoding` when no BOM is recognized.
    DEFAULT_ENCODING = 'utf_8'

    def __init__(self, items=None, eol=None, path=None, encoding='utf-8'):
        UserList.__init__(self, items or [])
        self._eol = eol
        self.path = path
        self.encoding = encoding

    def _get_eol(self):
        """Return the stored end of line, falling back to os.linesep."""
        return self._eol or os.linesep

    def _set_eol(self, eol):
        """Store `eol` only if no end of line has been set yet.

        An eol already stored (for example the one given to the constructor)
        is never overwritten through this setter.
        """
        self._eol = self._eol or eol

    eol = property(_get_eol, _set_eol)

    def slice(self, starts_before=None, starts_after=None, ends_before=None,
              ends_after=None):
        """
        slice([starts_before][, starts_after][, ends_before][, ends_after])
        -> SubRipFile clone

        All arguments are optional, and should be coercible to SubRipTime
        object.

        It reduces the set of subtitles to those that match the given time
        constraints. A constraint that evaluates to false (None, an empty
        mapping, ...) is ignored.

        The returned set is a clone, but still contains references to original
        subtitles. So if you shift this returned set, subs contained in the
        original SubRipFile instance will be altered too.

        Example:
            >>> subs.slice(ends_after={'seconds': 20}).shift(seconds=2)
        """
        clone = copy(self)
        items = clone.data

        if starts_before:
            items = [i for i in items if i.start < starts_before]
        if starts_after:
            items = [i for i in items if i.start > starts_after]
        if ends_before:
            items = [i for i in items if i.end < ends_before]
        if ends_after:
            items = [i for i in items if i.end > ends_after]

        # Always hand the clone its own list, even when nothing was filtered.
        clone.data = list(items)
        return clone

    def at(self, timestamp=None, **kwargs):
        """
        at(timestamp) -> SubRipFile clone

        timestamp argument should be coercible to SubRipTime object. It can
        also be described with keyword arguments, which are used when
        `timestamp` is not provided.

        A specialization of slice. Return all subtitles visible at the
        timestamp mark.

        Example:
            >>> subs.at((0, 0, 20, 0)).shift(seconds=2)
            >>> subs.at(seconds=20).shift(seconds=2)
        """
        time = timestamp or kwargs
        return self.slice(starts_before=time, ends_after=time)

    def shift(self, *args, **kwargs):
        """shift(hours, minutes, seconds, milliseconds, ratio)

        Shift `start` and `end` attributes of each item of the file either by
        applying a ratio or by adding an offset. All arguments are forwarded
        unchanged to the `shift` method of every item.

        `ratio` should be either an int or a float.
        Example to convert subtitles from 23.9 fps to 25 fps:
            >>> subs.shift(ratio=25/23.9)

        All "time" arguments are optional and have a default value of 0.
        Example to delay all subs by 2 seconds and a half:
            >>> subs.shift(seconds=2, milliseconds=500)
        """
        for item in self:
            item.shift(*args, **kwargs)

    def clean_indexes(self):
        """
        clean_indexes()

        Sort subs and reset their index attribute (numbered from 1). Should
        be called after destructive operations like split or such.
        """
        self.sort()
        for index, item in enumerate(self):
            item.index = index + 1

    @property
    def text(self):
        """Texts of all items, joined with a newline character."""
        return '\n'.join(i.text for i in self)

    @classmethod
    def open(cls, path='', encoding=None, error_handling=ERROR_PASS):
        """
        open([path, [encoding]]) -> SubRipFile

        If you do not provide any encoding, it can be detected if the file
        contains a byte order mark, otherwise it is set to utf-8 as default.

        `error_handling` -> one of ERROR_PASS, ERROR_LOG or ERROR_RAISE,
        applied to items that fail to parse.
        """
        source_file, encoding = cls._open_unicode_file(
            path, claimed_encoding=encoding)
        try:
            new_file = cls(path=path, encoding=encoding)
            new_file.read(source_file, error_handling=error_handling)
        finally:
            # Release the handle even when parsing raises (ERROR_RAISE mode
            # or a decoding error).
            source_file.close()
        return new_file

    @classmethod
    def from_string(cls, source, **kwargs):
        """
        from_string(source, **kwargs) -> SubRipFile

        `source` -> a unicode instance or at least a str instance encoded with
        `sys.getdefaultencoding()`

        `error_handling` may be given as a keyword; every other keyword is
        passed to the constructor.
        """
        error_handling = kwargs.pop('error_handling', cls.ERROR_PASS)
        new_file = cls(**kwargs)
        # keepends=True so the original end of line can still be guessed.
        new_file.read(source.splitlines(True), error_handling=error_handling)
        return new_file

    def read(self, source_file, error_handling=ERROR_PASS):
        """
        read(source_file, [error_handling]) -> self

        This method parses subtitles contained in `source_file` and appends
        them to the current instance.

        `source_file` -> Any iterable that yields unicode strings, like a file
        opened with `codecs.open()` or an array of unicode.
        """
        self.eol = self._guess_eol(source_file)
        self.extend(self.stream(source_file, error_handling=error_handling))
        return self

    @classmethod
    def stream(cls, source_file, error_handling=ERROR_PASS):
        """
        stream(source_file, [error_handling])

        This method yields SubRipItem instances as soon as they have been
        parsed without storing them. It is a kind of SAX parser for .srt
        files.

        `source_file` -> Any iterable that yields unicode strings, like a file
        opened with `codecs.open()` or an array of unicode.

        Example:
            >>> import pysrt
            >>> import codecs
            >>> file = codecs.open('movie.srt', encoding='utf-8')
            >>> for sub in pysrt.stream(file):
            ...     sub.text += "\\nHello !"
            ...     print(sub)
        """
        string_buffer = []
        # The extra '\n' acts as a final blank line so that the last item is
        # flushed even when the input does not end with one.
        for index, line in enumerate(chain(source_file, '\n')):
            if line.strip():
                string_buffer.append(line)
                continue

            # Blank line: whatever was buffered forms one item.
            source, string_buffer = string_buffer, []
            if not source:
                continue
            try:
                yield SubRipItem.from_lines(source)
            except Error as error:
                # Attach the offending text so handlers can report it.
                error.args += (''.join(source), )
                cls._handle_error(error, error_handling, index)

    def save(self, path=None, encoding=None, eol=None):
        """
        save([path][, encoding][, eol])

        Use initial path if no other provided.
        Use initial encoding if no other provided.
        Use initial eol if no other provided.
        """
        path = path or self.path
        encoding = encoding or self.encoding

        # The context manager closes the file even if serialization fails.
        with codecs.open(path, 'w+', encoding=encoding) as save_file:
            self.write_into(save_file, eol=eol)

    def write_into(self, output_file, eol=None):
        """
        write_into(output_file [, eol])

        Serialize current state into `output_file`.

        `output_file` -> Any instance that responds to `write()`, typically a
        file object
        """
        output_eol = eol or self.eol

        for item in self:
            string_repr = str(item)
            if output_eol != '\n':
                string_repr = string_repr.replace('\n', output_eol)
            output_file.write(string_repr)
            # Only add trailing eol if it's not already present.
            # It was kept in the SubRipItem's text before but it really
            # belongs here. Existing applications might give us subtitles
            # which already contain a trailing eol though.
            if not string_repr.endswith(2 * output_eol):
                output_file.write(output_eol)

    @classmethod
    def _guess_eol(cls, string_iterable):
        """Return the line ending of the first line, or os.linesep if none."""
        first_line = cls._get_first_line(string_iterable)
        # '\r\n' must be tested before '\n', which it also ends with.
        for eol in ('\r\n', '\r', '\n'):
            if first_line.endswith(eol):
                return eol
        return os.linesep

    @classmethod
    def _get_first_line(cls, string_iterable):
        """Return the first string of `string_iterable`, or '' if it is empty.

        When the iterable exposes both `tell` and `seek`, its position is
        restored afterwards so the line can be read again.
        """
        # Rewinding needs both methods: a position to record and a way to go
        # back to it.
        can_rewind = (hasattr(string_iterable, 'tell') and
                      hasattr(string_iterable, 'seek'))
        if can_rewind:
            previous_position = string_iterable.tell()

        try:
            first_line = next(iter(string_iterable))
        except StopIteration:
            return ''

        if can_rewind:
            string_iterable.seek(previous_position)

        return first_line

    @classmethod
    def _detect_encoding(cls, path):
        """Return the codec announced by the BOM of the file at `path`.

        Fall back to DEFAULT_ENCODING when no known BOM is found.
        """
        with open(path, 'rb') as file_descriptor:
            first_chars = file_descriptor.read(BIGGER_BOM)

        for bom, encoding in BOMS:
            if first_chars.startswith(bom):
                return encoding

        # TODO: maybe a chardet integration
        return cls.DEFAULT_ENCODING

    @classmethod
    def _open_unicode_file(cls, path, claimed_encoding=None):
        """Open `path` for decoded reading, positioned after its BOM if any.

        Return a `(file object, encoding)` tuple; the encoding is
        `claimed_encoding` when given, else the detected one.
        """
        encoding = claimed_encoding or cls._detect_encoding(path)
        # Plain 'r': the 'U' flag is rejected by Python 3.11+ and had no
        # effect here, since codecs.open() uses binary mode when an encoding
        # is given.
        source_file = codecs.open(path, 'r', encoding=encoding)

        try:
            # get rid of BOM if any
            possible_bom = CODECS_BOMS.get(encoding, None)
            if possible_bom:
                file_bom = source_file.read(len(possible_bom))
                if file_bom != possible_bom:
                    source_file.seek(0)  # if not rewind
        except Exception:
            # Do not leak the handle when the first read fails.
            source_file.close()
            raise
        return source_file, encoding

    @classmethod
    def _handle_error(cls, error, error_handling, index):
        """Apply the `error_handling` policy to a parsing `error`.

        ERROR_RAISE -> re-raise `error` with `index` prepended to its args.
        ERROR_LOG -> write the error name, `index` and first argument to
            sys.stderr.
        Any other value -> ignore the error.

        `index` is reported as the line where the error was detected.
        """
        if error_handling == cls.ERROR_RAISE:
            error.args = (index, ) + error.args
            raise error
        if error_handling == cls.ERROR_LOG:
            name = type(error).__name__
            # Replace non-ASCII characters, then decode back to text:
            # sys.stderr is a text stream and rejects bytes on Python 3.
            message = error.args[0].encode('ascii', 'replace').decode('ascii')
            sys.stderr.write('PySRT-%s(line %s): \n' % (name, index))
            sys.stderr.write(message)
            sys.stderr.write('\n')
|