"""
    pygments.util
    ~~~~~~~~~~~~~

    Utility functions.

    :copyright: Copyright 2006-2023 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""
import re
from io import TextIOWrapper
# Splits a command line / path on forward slashes, backslashes and spaces.
split_path_re = re.compile(r'[/\\ ]')

# Finds a ``<!DOCTYPE ...>`` declaration.  Group 1 holds the leading part:
# the root name, optionally followed by a keyword and one quoted identifier.
doctype_lookup_re = re.compile(r'''
    <!DOCTYPE\s+(
     [a-zA-Z_][a-zA-Z0-9]*
     (?: \s+      # optional in HTML5
     [a-zA-Z_][a-zA-Z0-9]*\s+
     "[^"]*")?
     )
     [^>]*>
''', re.DOTALL | re.MULTILINE | re.VERBOSE)

# Finds an opening tag (with optional attributes) followed by some closing tag.
tag_re = re.compile(r'<(.+?)(\s.*?)?>.*?</.+?>',
                    re.IGNORECASE | re.DOTALL | re.MULTILINE)

# Matches an ``<?xml ... ?>`` declaration, allowing leading whitespace.
xml_decl_re = re.compile(r'\s*<\?xml[^>]*\?>', re.I)
class ClassNotFound(ValueError):
    """Error raised when a lookup function finds no matching class."""
class OptionError(Exception):
    """
    Error raised by the option processing functions when an option's
    type or value is not acceptable.
    """
def get_choice_opt(options, optname, allowed, default=None, normcase=False):
    """
    Return the value of `optname` from the dictionary `options` (or
    `default` if the key is missing), provided it is one of `allowed`.

    If `normcase` is true, string values are lower-cased before the
    membership check and the lower-cased value is returned.

    Raises `OptionError` if the value is not in `allowed`.
    """
    value = options.get(optname, default)
    # Only strings can be case-normalised.  Anything else (for instance a
    # ``None`` default) goes straight to the membership check so that it is
    # reported as an OptionError instead of crashing with AttributeError.
    if normcase and isinstance(value, str):
        value = value.lower()
    if value not in allowed:
        raise OptionError('Value for option %s must be one of %s' %
                          (optname, ', '.join(map(str, allowed))))
    return value
def get_bool_opt(options, optname, default=None):
    """
    Look up `optname` in the dictionary `options` (falling back to
    `default`) and interpret the result as a Boolean.

    Booleans are returned unchanged and other integers are converted with
    ``bool()``.  Strings are accepted so that values can come from the
    command line: ``1``, ``yes``, ``true`` and ``on`` mean ``True``;
    ``0``, ``no``, ``false`` and ``off`` mean ``False`` (matched
    case-insensitively).

    Raises `OptionError` for any other type or string.  The fallback
    `default` goes through the same checks as a supplied value.
    """
    value = options.get(optname, default)
    # bool is tested first because it is a subclass of int.
    if isinstance(value, bool):
        return value
    if isinstance(value, int):
        return bool(value)
    if not isinstance(value, str):
        raise OptionError('Invalid type %r for option %s; use '
                          '1/0, yes/no, true/false, on/off' % (
                              value, optname))
    lowered = value.lower()
    if lowered in ('1', 'yes', 'true', 'on'):
        return True
    if lowered in ('0', 'no', 'false', 'off'):
        return False
    raise OptionError('Invalid value %r for option %s; use '
                      '1/0, yes/no, true/false, on/off' % (
                          value, optname))
def get_int_opt(options, optname, default=None):
    """
    As :func:`get_bool_opt`, but interpret the value as an integer.

    The value of `optname` in `options` (or `default` if the key is
    missing) is passed to ``int()``.  Raises `OptionError` if ``int()``
    rejects the value's type or its contents.
    """
    value = options.get(optname, default)
    try:
        return int(value)
    except TypeError as err:
        raise OptionError('Invalid type %r for option %s; you '
                          'must give an integer value' % (
                              value, optname)) from err
    except ValueError as err:
        raise OptionError('Invalid value %r for option %s; you '
                          'must give an integer value' % (
                              value, optname)) from err
def get_list_opt(options, optname, default=None):
    """
    Return the value of `optname` from the dictionary `options` (or
    `default` if the key is missing) as a list.

    A string is split at whitespace; a list or tuple is copied into a new
    list.  Raises `OptionError` for any other type.
    """
    value = options.get(optname, default)
    if isinstance(value, str):
        return value.split()
    if isinstance(value, (list, tuple)):
        return list(value)
    raise OptionError('Invalid type %r for option %s; you '
                      'must give a list value' % (
                          value, optname))
def docstring_headline(obj):
    """
    Return the first paragraph of `obj`'s docstring as a single line.

    Lines are taken from the stripped docstring up to the first blank
    line, each stripped and joined with single spaces.  Returns an empty
    string when `obj` has no docstring.
    """
    if not obj.__doc__:
        return ''
    pieces = []
    for line in obj.__doc__.strip().splitlines():
        stripped = line.strip()
        if not stripped:
            # A blank line ends the headline paragraph.
            break
        pieces.append(" " + stripped)
    # Every piece carries a leading space; drop the one before the first.
    return ''.join(pieces).lstrip()
def make_analysator(f):
    """
    Wrap `f` in a static text analyser that always returns a float.

    The wrapper returns ``0.0`` when `f` raises, returns a falsy value, or
    returns something ``float()`` cannot convert; otherwise the result is
    clamped to the range 0.0 .. 1.0.  The wrapper keeps `f`'s docstring.
    """
    def text_analyse(text):
        try:
            score = f(text)
        except Exception:
            return 0.0
        if score:
            try:
                return min(1.0, max(0.0, float(score)))
            except (ValueError, TypeError):
                pass
        return 0.0

    text_analyse.__doc__ = f.__doc__
    return staticmethod(text_analyse)
def shebang_matches(text, regex):
    r"""Check if the given regular expression matches the last part of the
    shebang if one exists.

        >>> from pygments.util import shebang_matches
        >>> shebang_matches('#!/usr/bin/env python', r'python(2\.\d)?')
        True
        >>> shebang_matches('#!/usr/bin/python2.4', r'python(2\.\d)?')
        True
        >>> shebang_matches('#!/usr/bin/python-ruby', r'python(2\.\d)?')
        False
        >>> shebang_matches('#!/usr/bin/python/ruby', r'python(2\.\d)?')
        False
        >>> shebang_matches('#!/usr/bin/startsomethingwith python',
        ...                 r'python(2\.\d)?')
        True

    It also checks for common windows executable file extensions::

        >>> shebang_matches('#!C:\\Python2.4\\Python.exe', r'python(2\.\d)?')
        True

    Parameters (``'-f'`` or ``'--foo'`` are ignored so ``'perl'`` does
    the same as ``'perl -e'``)

    Note that this method automatically searches the whole string (eg:
    the regular expression is wrapped in ``'^$'``)
    """
    # Only the first line can carry a shebang; it is compared lower-cased.
    first_line = text.partition('\n')[0].lower()
    if not first_line.startswith('#!'):
        return False
    # Break the command into path components and words, dropping empty
    # pieces and option-like pieces that start with a dash.
    parts = [part for part in split_path_re.split(first_line[2:].strip())
             if part and not part.startswith('-')]
    if not parts:
        return False
    # Anchor the caller's pattern and allow a Windows executable suffix.
    pattern = re.compile(r'^%s(\.(exe|cmd|bat|bin))?$' % regex, re.IGNORECASE)
    return pattern.search(parts[-1]) is not None
def doctype_matches(text, regex):
    """Check if the doctype matches a regular expression (if present).

    Returns ``False`` when `text` contains no DOCTYPE declaration.

    Note that this method only checks the first part of a DOCTYPE.
    eg: 'html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"'
    """
    found = doctype_lookup_re.search(text)
    if found is None:
        return False
    declared = found.group(1).strip()
    # The pattern is matched case-insensitively from the start of the
    # captured doctype text.
    return re.match(regex, declared, re.I) is not None
def html_doctype_matches(text):
    """Check if the file looks like it has a html doctype."""
    # The pattern must not carry surrounding spaces: doctype_matches()
    # matches it against the stripped doctype text.
    return doctype_matches(text, r'html')
# Results of the tag search in looks_like_xml(), keyed by hash(text).
_looks_like_xml_cache = {}


def looks_like_xml(text):
    """Check if a doctype exists or if we have some tags.

    An XML declaration at the start, or a DOCTYPE anywhere in `text`,
    answers ``True`` without touching the cache.  Otherwise the first 1000
    characters are searched for a tag pair and that result is cached.
    """
    if xml_decl_re.match(text):
        return True
    key = hash(text)
    if key in _looks_like_xml_cache:
        return _looks_like_xml_cache[key]
    if doctype_lookup_re.search(text) is not None:
        return True
    has_tags = tag_re.search(text[:1000]) is not None
    _looks_like_xml_cache[key] = has_tags
    return has_tags
def surrogatepair(c):
    """Given a unicode character code with length greater than 16 bits,
    return the two 16 bit surrogate pair as a ``(high, low)`` tuple.
    """
    # From example D28 of:
    # http://www.unicode.org/book/ch03.pdf
    # 0xd7c0 equals 0xd800 - (0x10000 >> 10), so the 0x10000 offset is
    # already folded into the constant.
    high = 0xd7c0 + (c >> 10)
    low = 0xdc00 + (c & 0x3ff)
    return (high, low)
def format_lines(var_name, seq, raw=False, indent_level=0):
    """Formats a sequence of strings for output.

    Produces source text of the form ``var_name = (`` ... ``)`` with one
    item per line, indented four spaces per `indent_level`.  With `raw`
    true the items are emitted as they are; otherwise each item is written
    as a single-quoted string literal.
    """
    base_indent = ' ' * indent_level * 4
    inner_indent = ' ' * (indent_level + 1) * 4
    lines = [base_indent + var_name + ' = (']
    if raw:
        # These should be preformatted reprs of, say, tuples.
        for item in seq:
            lines.append(inner_indent + item + ',')
    else:
        for item in seq:
            # Force use of single quotes: appending a double quote makes
            # repr() pick single quotes; the extra character is then cut
            # out again, keeping the closing quote.
            quoted = repr(item + '"')
            lines.append(inner_indent + quoted[:-2] + quoted[-1] + ',')
    lines.append(base_indent + ')')
    return '\n'.join(lines)
def duplicates_removed(it, already_seen=()):
    """
    Return the items of the iterable `it` as a list, keeping only the
    first occurrence of each and skipping anything in `already_seen`.

    Order is preserved.
    """
    unique = []
    seen = set()
    for item in it:
        if item not in seen and item not in already_seen:
            unique.append(item)
            seen.add(item)
    return unique
class Future:
    """Generic class to defer some work.

    Handled specially in RegexLexerMeta, to support regex string
    construction at first use.
    """

    def get(self):
        # No implementation here; calling this on the base class raises.
        raise NotImplementedError
def guess_decode(text):
    """Decode *text* with guessed encoding.

    First try UTF-8; this should fail for non-UTF-8 encodings.
    Then try the preferred locale encoding.
    Fall back to latin-1, which always works.

    Returns a ``(decoded_text, encoding_name)`` tuple.
    """
    try:
        return text.decode('utf-8'), 'utf-8'
    except UnicodeDecodeError:
        pass
    try:
        import locale
        prefencoding = locale.getpreferredencoding()
        # Decode with the locale encoding itself.  A bare ``decode()``
        # would only retry UTF-8, which has already failed above.
        return text.decode(prefencoding), prefencoding
    except (UnicodeDecodeError, LookupError):
        return text.decode('latin1'), 'latin1'
def guess_decode_from_terminal(text, term):
    """Decode *text* coming from terminal *term*.

    First try the terminal encoding, if given.
    Then try UTF-8.  Then try the preferred locale encoding.
    Fall back to latin-1, which always works.

    Returns a ``(decoded_text, encoding_name)`` tuple.
    """
    encoding = getattr(term, 'encoding', None)
    if encoding:
        try:
            return text.decode(encoding), encoding
        except (UnicodeDecodeError, LookupError):
            # Undecodable bytes or an unknown codec name: fall through to
            # the generic guessing below.
            pass
    return guess_decode(text)
def terminal_encoding(term):
    """Return our best guess of encoding for the given *term*.

    Uses ``term.encoding`` when that attribute exists and is truthy,
    otherwise the preferred locale encoding.
    """
    encoding = getattr(term, 'encoding', None)
    if encoding:
        return encoding
    import locale
    return locale.getpreferredencoding()
class UnclosingTextIOWrapper(TextIOWrapper):
    """Text wrapper whose ``close()`` only flushes."""

    # Don't close underlying buffer on destruction.
    def close(self):
        self.flush()