You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
212 lines
6.6 KiB
212 lines
6.6 KiB
from pyjsparser.pyjsparserdata import *
|
|
|
|
# Single-character regexp metacharacters.
# NOTE(review): not referenced anywhere in the visible code.
REGEXP_SPECIAL_SINGLE = set('\\^$*+?.')

# Characters that isPatternCharacter() refuses to treat as a plain literal.
NOT_PATTERN_CHARS = set('^$\\.*+?()[]|')  # what about '{', '}', ???

# Letters that, right after a backslash, are emitted as a SpecialChar node
# by the atom-escape parsing (\d, \D, \s, \S, \w, \W).
CHAR_CLASS_ESCAPE = set('dDsSwW')

# Letters recognised after a backslash as control escapes (\f, \n, \r, \t, \v).
CONTROL_ESCAPE_CHARS = set('fnrtv')

# The 52 ASCII letters.
# NOTE(review): presumably the letters allowed after '\c' -- confirm.
CONTROL_LETTERS = set('abcdefghijklmnopqrstuvwxyz'
                      'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
|
|
|
|
|
|
def SpecialChar(char):
    """Build a 'SpecialChar' node dict whose 'content' is *char*."""
    node = {'type': 'SpecialChar'}
    node['content'] = char
    return node
|
|
|
|
|
|
def isPatternCharacter(char):
    """Return True when *char* is not listed in NOT_PATTERN_CHARS."""
    if char in NOT_PATTERN_CHARS:
        return False
    return True
|
|
|
|
|
|
class JsRegExpParser:
    """Recursive-descent parser for JavaScript regular expression sources.

    The parser walks ``source`` with a single cursor (``self.index``) and
    builds a tree of plain dicts.  Node shapes produced by this class:

    * ``{'type': 'Pattern', 'contents': <Disjunction>}``
    * ``{'type': 'Disjunction', 'contents': [<Alternative>, ...]}``
    * ``{'type': 'Alternative', 'contents': [<Term or Assertion>, ...]}``
    * ``{'type': 'Term', 'contents': <Atom>}``
    * ``{'type': 'Assertion', 'content': <SpecialChar or lookahead dict>}``
    * ``{'type': 'Atom', 'content': ..., 'quantifier': <Quantifier or None>}``
    * ``{'type': 'Quantifier', 'contents': ..., 'greedy': bool}``
    * ``{'type': 'Group', 'contents': <Disjunction>, 'capturing': bool}``
    * ``{'type': 'BackReference', 'content': int}``

    Malformed input raises ``ValueError``.  Character classes (``[...]``)
    are not supported yet and raise ``NotImplementedError``.
    """

    _HEX_DIGITS = '0123456789abcdefABCDEF'

    def __init__(self, source, flags):
        """Store the pattern text and flags and reset the cursor."""
        self.source = source
        self.flags = flags
        self.index = 0
        self.length = len(source)
        self.lineNumber = 0
        self.lineStart = 0

    def parsePattern(self):
        """Parse the whole source and return the root 'Pattern' node.

        Raises ValueError if input is left over, which can only be an
        unmatched ')' because that is the one character that stops a
        disjunction before the end of the source.
        """
        disjunction = self.parseDisjunction()
        if not self.isEOF():
            self.throwUnexpected()
        return {'type': 'Pattern', 'contents': disjunction}

    def parseDisjunction(self):
        """Parse one or more alternatives separated by '|'.

        Stops (without consuming) at the end of the source or at a ')',
        so the caller that opened a group can consume the closing paren.
        """
        alternatives = [self.parseAlternative()]
        while self.follows('|'):
            self.index += 1
            alternatives.append(self.parseAlternative())
        return {'type': 'Disjunction', 'contents': alternatives}

    def isEOF(self):
        """Return True when the cursor is at or past the end of the source."""
        return self.index >= self.length

    def throwUnexpected(self, expected=None):
        """Raise ValueError describing what sits at the cursor.

        ``expected`` optionally names the character the caller wanted.
        """
        if self.isEOF():
            found = 'end of pattern'
        else:
            found = repr(self.source[self.index])
        message = 'Invalid regular expression: unexpected %s at index %d' % (
            found, self.index)
        if expected is not None:
            message += ' (expected %r)' % (expected,)
        raise ValueError(message)

    def expect_character(self, character):
        """Consume ``character`` at the cursor or raise ValueError."""
        # Check for EOF first so a truncated pattern gives a syntax error
        # instead of an IndexError.
        if self.isEOF() or self.source[self.index] != character:
            self.throwUnexpected(character)
        self.index += 1

    def parseAlternative(self):
        """Parse terms until '|', ')' or the end of the source."""
        contents = []
        # ')' must terminate the alternative, otherwise a group body would
        # never hand control back to the code that expects the ')'.
        while not self.isEOF() and self.source[self.index] not in '|)':
            contents.append(self.parseTerm())
        return {'type': 'Alternative', 'contents': contents}

    def follows(self, chars):
        """Return True when the source continues with ``chars`` at the cursor."""
        return self.source.startswith(chars, self.index)

    def parseTerm(self):
        """Parse an assertion if one is present, otherwise an atom.

        An assertion node is returned as-is; an atom is wrapped in a
        'Term' node (its quantifier lives inside the atom).
        """
        assertion = self.parseAssertion()
        if assertion:
            return assertion
        return {'type': 'Term', 'contents': self.parseAtom()}

    def parseAssertion(self):
        """Parse ``$``, ``^``, ``\\b``, ``\\B``, ``(?=...)`` or ``(?!...)``.

        Returns an 'Assertion' node, or None (consuming nothing) when the
        cursor is not at an assertion.
        """
        for token in ('$', '^', '\\b', '\\B'):
            if self.follows(token):
                self.index += len(token)
                return {'type': 'Assertion', 'content': SpecialChar(token)}
        for opener, negated in (('(?=', False), ('(?!', True)):
            if self.follows(opener):
                self.index += len(opener)
                dis = self.parseDisjunction()
                self.expect_character(')')
                # NOTE(review): 'Lookached' looks like a typo for
                # 'Lookahead'; kept unchanged for compatibility.
                content = {
                    'type': 'Lookached',
                    'contents': dis,
                    'negated': negated
                }
                return {'type': 'Assertion', 'content': content}
        return None

    def parseAtom(self):
        """Parse a single atom plus its optional quantifier.

        Raises ValueError when the cursor is at a character that cannot
        start an atom (for example a quantifier with nothing to repeat);
        returning without consuming input here would make the caller
        loop forever.
        """
        if self.isEOF():
            self.throwUnexpected()
        if self.follows('.'):
            content = SpecialChar('.')
            self.index += 1
        elif self.follows('\\'):
            self.index += 1
            content = self.parseAtomEscape()
        elif self.follows('['):
            content = self.parseCharacterClass()
        elif self.follows('(?:'):
            content = self._parseGroup(3, False)
        elif self.follows('('):
            content = self._parseGroup(1, True)
        elif isPatternCharacter(self.source[self.index]):
            content = self.source[self.index]
            self.index += 1
        else:
            self.throwUnexpected()
        quantifier = self.parseQuantifier()
        return {'type': 'Atom', 'content': content, 'quantifier': quantifier}

    def _parseGroup(self, opener_length, capturing):
        """Parse a group body after skipping its ``opener_length``-char opener."""
        self.index += opener_length
        dis = self.parseDisjunction()
        self.expect_character(')')
        return {'type': 'Group', 'contents': dis, 'capturing': capturing}

    def parseCharacterClass(self):
        """Character classes (``[...]``) are not implemented yet."""
        raise NotImplementedError(
            'Character classes are not supported (index %d)' % self.index)

    def parseQuantifier(self):
        """Parse an optional quantifier and its lazy ('?') modifier.

        Returns a 'Quantifier' node, or None when no quantifier follows.
        """
        prefix = self.parseQuantifierPrefix()
        if not prefix:
            return None
        greedy = True
        if self.follows('?'):
            self.index += 1
            greedy = False
        return {'type': 'Quantifier', 'contents': prefix, 'greedy': greedy}

    def parseQuantifierPrefix(self):
        """Parse ``+``, ``?``, ``*`` or a ``{n}`` / ``{n,}`` / ``{n,m}`` range.

        Returns the symbol as a string, or a ``(minimum, maximum)`` tuple
        for a range: ``{n}`` gives ``(n, n)``, ``{n,}`` gives ``(n, None)``
        and ``{n,m}`` gives ``(n, m)``.  Returns None and leaves the cursor
        untouched when no valid quantifier follows.
        """
        for symbol in ('+', '?', '*'):
            if self.follows(symbol):
                self.index += 1
                return symbol
        if not self.follows('{'):
            return None
        # Try to match a range; on any failure restore the cursor so the
        # '{' can be parsed as an ordinary character instead.
        start = self.index
        self.index += 1
        digs1 = self.scanDecimalDigs()
        if not digs1:
            self.index = start
            return None
        minimum = int(digs1)
        if self.follows(','):
            self.index += 1
            digs2 = self.scanDecimalDigs()
            # '{n,}' has no upper bound
            maximum = int(digs2) if digs2 else None
        else:
            # '{n}' means exactly n repetitions
            maximum = minimum
        if not self.follows('}'):
            self.index = start
            return None
        self.index += 1
        return minimum, maximum

    def parseAtomEscape(self):
        """Parse what follows a backslash (the backslash is already consumed).

        Returns a 'BackReference' node for ``\\1``..., a SpecialChar for
        the class escapes in CHAR_CLASS_ESCAPE, and otherwise whatever
        parseCharacterEscape returns.
        """
        if self.isEOF():
            # the pattern ends with a lone backslash
            self.throwUnexpected()
        ch = self.source[self.index]
        if isDecimalDigit(ch) and ch != '0':
            digs = self.scanDecimalDigs()
            return {'type': 'BackReference', 'content': int(digs)}
        if ch in CHAR_CLASS_ESCAPE:
            self.index += 1
            return SpecialChar('\\' + ch)
        return self.parseCharacterEscape()

    def parseCharacterEscape(self):
        """Parse a character escape and consume it.

        Control escapes, ``\\0``, ``\\cX``, ``\\xHH`` and ``\\uHHHH`` are
        returned as SpecialChar nodes holding the escape as written.  Any
        other escaped character is an identity escape and is returned as
        the plain character.
        """
        if self.isEOF():
            self.throwUnexpected()
        ch = self.source[self.index]
        if ch in CONTROL_ESCAPE_CHARS or ch == '0':
            self.index += 1
            return SpecialChar('\\' + ch)
        if ch == 'c':
            letter = self.source[self.index + 1:self.index + 2]
            if letter in CONTROL_LETTERS:
                self.index += 2
                return SpecialChar('\\c' + letter)
        for prefix, width in (('x', 2), ('u', 4)):
            if ch != prefix:
                continue
            digits = self.source[self.index + 1:self.index + 1 + width]
            if len(digits) == width and all(
                    d in self._HEX_DIGITS for d in digits):
                self.index += 1 + width
                return SpecialChar('\\' + ch + digits)
        # Identity escape (also the fallback for an incomplete \c, \x, \u).
        self.index += 1
        return ch

    def scanDecimalDigs(self):
        """Consume a run of decimal digits and return it (possibly '')."""
        s = self.index
        while not self.isEOF() and isDecimalDigit(self.source[self.index]):
            self.index += 1
        return self.source[s:self.index]
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual demo: parse a sample pattern and print the resulting tree.
    # Guarded so that importing this module does not run the parser.
    parser = JsRegExpParser('a(?=x)', '')
    print(parser.parsePattern())
|