You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
89 lines
3.5 KiB
89 lines
3.5 KiB
######################## BEGIN LICENSE BLOCK ########################
|
|
# The Original Code is mozilla.org code.
|
|
#
|
|
# The Initial Developer of the Original Code is
|
|
# Netscape Communications Corporation.
|
|
# Portions created by the Initial Developer are Copyright (C) 1998
|
|
# the Initial Developer. All Rights Reserved.
|
|
#
|
|
# Contributor(s):
|
|
# Mark Pilgrim - port to Python
|
|
#
|
|
# This library is free software; you can redistribute it and/or
|
|
# modify it under the terms of the GNU Lesser General Public
|
|
# License as published by the Free Software Foundation; either
|
|
# version 2.1 of the License, or (at your option) any later version.
|
|
#
|
|
# This library is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
# Lesser General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU Lesser General Public
|
|
# License along with this library; if not, write to the Free Software
|
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
|
# 02110-1301 USA
|
|
######################### END LICENSE BLOCK #########################
|
|
|
|
import logging
|
|
|
|
from .enums import MachineState
|
|
|
|
|
|
class CodingStateMachine:
    """
    A state machine to verify a byte sequence for a particular encoding. For
    each byte the detector receives, it will feed that byte to every active
    state machine available, one byte at a time. The state machine changes its
    state based on its previous state and the byte it receives. There are 3
    states in a state machine that are of interest to an auto-detector:

    START state: This is the state to start with, or a legal byte sequence
                 (i.e. a valid code point) for a character has been
                 identified.

    ME state:  This indicates that the state machine identified a byte sequence
               that is specific to the charset it is designed for and that
               there is no other possible encoding which can contain this byte
               sequence. This will lead to an immediate positive answer for
               the detector.

    ERROR state: This indicates the state machine identified an illegal byte
                 sequence for that encoding. This will lead to an immediate
                 negative answer for this encoding. Detector will exclude this
                 encoding from consideration from here on.
    """

    def __init__(self, sm):
        """Create a state machine driven by the model ``sm``.

        ``sm`` is indexed by the string keys ``"class_table"``,
        ``"char_len_table"``, ``"class_factor"``, ``"state_table"``,
        ``"name"`` and ``"language"`` by the methods of this class.
        """
        self._model = sm
        self._curr_byte_pos = 0
        self._curr_char_len = 0
        # Placeholder only; reset() below installs the real starting state.
        self._curr_state = None
        self.logger = logging.getLogger(__name__)
        self.reset()

    def reset(self):
        """Put the machine back into ``MachineState.START``."""
        self._curr_state = MachineState.START

    def next_state(self, c):
        """Feed one byte ``c`` to the machine and return the resulting state.

        ``c`` is used as an index into the model's ``"class_table"``.
        """
        model = self._model

        # For each byte we get its class; if it is the first byte of a
        # character, we also get the character's byte length.
        byte_class = model["class_table"][c]
        if self._curr_state == MachineState.START:
            self._curr_byte_pos = 0
            self._curr_char_len = model["char_len_table"][byte_class]

        # The state table is flat: the row is selected by the current state
        # (scaled by class_factor) and the column by the byte's class.
        table_index = self._curr_state * model["class_factor"] + byte_class
        self._curr_state = model["state_table"][table_index]
        self._curr_byte_pos += 1
        return self._curr_state

    def get_current_charlen(self):
        """Return the character length recorded at the last START state."""
        return self._curr_char_len

    def get_coding_state_machine(self):
        """Return the model's ``"name"`` entry."""
        return self._model["name"]

    @property
    def language(self):
        """The model's ``"language"`` entry."""
        return self._model["language"]
|