"""
    pygments.lexers.textfmts
    ~~~~~~~~~~~~~~~~~~~~~~~~

    Lexers for various text formats.

    :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re

from pygments.lexers import guess_lexer, get_lexer_by_name
from pygments.lexer import RegexLexer, bygroups, default, include
from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
    Number, Generic, Literal, Punctuation
from pygments.util import ClassNotFound

__all__ = ['IrcLogsLexer', 'TodotxtLexer', 'HttpLexer', 'GettextLexer',
           'NotmuchLexer', 'KernelLogLexer']


class IrcLogsLexer(RegexLexer):
    """
    Lexer for IRC logs in *irssi*, *xchat* or *weechat* style.
    """

    name = 'IRC logs'
    aliases = ['irc']
    filenames = ['*.weechatlog']
    mimetypes = ['text/x-irclog']

    # All patterns below are written in verbose mode: whitespace outside of
    # character classes is ignored and ``#`` starts a comment that runs to the
    # end of the line, so every commented fragment must sit on its own line.
    flags = re.VERBOSE | re.MULTILINE

    # Optional leading timestamp, captured as a single group.
    timestamp = r"""
        (
          # irssi / xchat and others
          (?: \[|\()?                  # Opening bracket or paren for the timestamp
            (?:                        # Timestamp
                (?: (?:\d{1,4} [-/])*  # Date as - or /-separated groups of digits
                    (?:\d{1,4})
                 [T ])?                # Date/time separator: T or space
                (?: \d?\d [:.])*       # Time as :/.-separated groups of 1 or 2 digits
                    (?: \d?\d)
            )
          (?: \]|\))?\s+               # Closing bracket or paren for the timestamp
        |
          # weechat
          \d{4}\s\w{3}\s\d{2}\s        # Date
          \d{2}:\d{2}:\d{2}\s+         # Time + Whitespace
        |
          # xchat
          \w{3}\s\d{2}\s               # Date
          \d{2}:\d{2}:\d{2}\s+         # Time + Whitespace
        )?
    """

    tokens = {
        'root': [
            # log start/end
            (r'^\*\*\*\*(.*)\*\*\*\*$', Comment),
            # hack
            ("^" + timestamp + r'(\s*<[^>]*>\s*)$',
             bygroups(Comment.Preproc, Name.Tag)),
            # normal msgs
            ("^" + timestamp + r"""
                (\s*<.*?>\s*)          # Nick
             """,
             bygroups(Comment.Preproc, Name.Tag), 'msg'),
            # /me msgs
            ("^" + timestamp + r"""
                (\s*[*]\s+)            # Star
                (\S+\s+.*?\n)          # Nick + rest of message
             """,
             bygroups(Comment.Preproc, Keyword, Generic.Inserted)),
            # join/part msgs: either "***" or an arrow/marker such as
            # "-!-", "-->" or "<--"
            ("^" + timestamp + r"""
                (\s*(?:\*{3}|<?-[!@=P]?->?)\s*)  # Star(s) or symbols
                (\S+\s+)                         # Nick + Space
                (.*?\n)                          # Rest of message
             """,
             bygroups(Comment.Preproc, Keyword, String, Comment)),
            # anything else is plain text
            (r"^.*?\n", Text),
        ],
        'msg': [
            (r"\S+:(?!//)", Name.Attribute),  # Prefix
            (r".*\n", Text, '#pop'),
        ],
    }


class GettextLexer(RegexLexer):
    """
    Lexer for Gettext catalog files.

    .. versionadded:: 0.9
    """
    name = 'Gettext Catalog'
    aliases = ['pot', 'po']
    filenames = ['*.pot', '*.po']
    mimetypes = ['application/x-gettext', 'text/x-gettext', 'text/gettext']

    tokens = {
        'root': [
            # "#," lines (flags) and "#:" lines (references)
            (r'^#,\s.*?$', Keyword.Type),
            (r'^#:\s.*?$', Keyword.Declaration),
            # (r'^#$', Comment),
            (r'^(#|#\.\s|#\|\s|#~\s|#\s).*$', Comment.Single),
            # quoted "Header-Name: value" lines
            (r'^(")([A-Za-z-]+:)(.*")$',
             bygroups(String, Name.Property, String)),
            # any other quoted (continuation) line
            (r'^".*"$', String),
            (r'^(msgid|msgid_plural|msgstr|msgctxt)(\s+)(".*")$',
             bygroups(Name.Variable, Text, String)),
            (r'^(msgstr\[)(\d)(\])(\s+)(".*")$',
             bygroups(Name.Variable, Number.Integer, Name.Variable, Text, String)),
        ]
    }


class HttpLexer(RegexLexer):
    """
    Lexer for HTTP sessions.

    .. versionadded:: 1.5
    """

    name = 'HTTP'
    aliases = ['http']

    flags = re.DOTALL

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """Reset the content-type state."""
        self.content_type = None
        return RegexLexer.get_tokens_unprocessed(self, text, stack)

    def header_callback(self, match):
        """
        Tokenize one ``Name: value`` header line.

        The match groups are: header name, whitespace, colon, whitespace,
        value, line terminator.  When the header is ``Content-Type`` the
        media type (with any ``;`` parameters stripped) is remembered on the
        lexer so that `content_callback` can choose a lexer for the body.
        """
        if match.group(1).lower() == 'content-type':
            # Keep only the media type, dropping parameters like "; charset=...".
            self.content_type = match.group(5).strip().partition(';')[0].strip()
        yield match.start(1), Name.Attribute, match.group(1)
        yield match.start(2), Text, match.group(2)
        yield match.start(3), Operator, match.group(3)
        yield match.start(4), Text, match.group(4)
        yield match.start(5), Literal, match.group(5)
        yield match.start(6), Text, match.group(6)

    def continuous_header_callback(self, match):
        """
        Tokenize a header continuation line.

        The match groups are: leading whitespace, value, line terminator.
        """
        yield match.start(1), Text, match.group(1)
        yield match.start(2), Literal, match.group(2)
        yield match.start(3), Text, match.group(3)

    def content_callback(self, match):
        """
        Tokenize the message body.

        If a content type was recorded by `header_callback`, the body is
        delegated to the first lexer registered for that MIME type (or for
        its generalized form); token indices are shifted so they stay
        relative to the whole input.  Otherwise the body is emitted as a
        single `Text` token.
        """
        content_type = getattr(self, 'content_type', None)
        content = match.group()
        offset = match.start()
        if content_type:
            from pygments.lexers import get_lexer_for_mimetype
            possible_lexer_mimetypes = [content_type]
            if '+' in content_type:
                # application/calendar+xml can be treated as application/xml
                # if there's not a better match.
                general_type = re.sub(r'^(.*)/.*\+(.*)$', r'\1/\2',
                                      content_type)
                possible_lexer_mimetypes.append(general_type)

            for mimetype in possible_lexer_mimetypes:
                try:
                    lexer = get_lexer_for_mimetype(mimetype)
                except ClassNotFound:
                    pass
                else:
                    for idx, token, value in lexer.get_tokens_unprocessed(content):
                        yield offset + idx, token, value
                    return
        yield offset, Text, content

    tokens = {
        'root': [
            # request line
            (r'(GET|POST|PUT|DELETE|HEAD|OPTIONS|TRACE|PATCH)( +)([^ ]+)( +)'
             r'(HTTP)(/)(1\.[01]|2(?:\.0)?|3)(\r?\n|\Z)',
             bygroups(Name.Function, Text, Name.Namespace, Text,
                      Keyword.Reserved, Operator, Number, Text),
             'headers'),
            # status line
            (r'(HTTP)(/)(1\.[01]|2(?:\.0)?|3)( +)(\d{3})(?:( +)([^\r\n]*))?(\r?\n|\Z)',
             bygroups(Keyword.Reserved, Operator, Number, Text, Number, Text,
                      Name.Exception, Text),
             'headers'),
        ],
        'headers': [
            (r'([^\s:]+)( *)(:)( *)([^\r\n]+)(\r?\n|\Z)', header_callback),
            (r'([\t ]+)([^\r\n]+)(\r?\n|\Z)', continuous_header_callback),
            # an empty line ends the header section
            (r'\r?\n', Text, 'content')
        ],
        'content': [
            # re.DOTALL makes this consume the entire remaining body
            (r'.+', content_callback)
        ]
    }

    def analyse_text(text):
        """Return whether *text* starts with an HTTP request method and path."""
        return text.startswith(('GET /', 'POST /', 'PUT /', 'DELETE /', 'HEAD /',
                                'OPTIONS /', 'TRACE /', 'PATCH /'))


class TodotxtLexer(RegexLexer):
    """
    Lexer for Todo.txt todo list format.

    .. versionadded:: 2.0
    """

    name = 'Todotxt'
    url = 'http://todotxt.com/'
    aliases = ['todotxt']
    # *.todotxt is not a standard extension for Todo.txt files; including it
    # makes testing easier, and also makes autodetecting file type easier.
    filenames = ['todo.txt', '*.todotxt']
    mimetypes = ['text/x-todo']

    # Aliases mapping standard token types of Todo.txt format concepts
    CompleteTaskText = Operator  # Chosen to de-emphasize complete tasks
    IncompleteTaskText = Text  # Incomplete tasks should look like plain text

    # Priority should have most emphasis to indicate importance of tasks
    Priority = Generic.Heading
    # Dates should have next most emphasis because time is important
    Date = Generic.Subheading

    # Project and context should have equal weight, and be in different colors
    Project = Generic.Error
    Context = String

    # If tag functionality is added, it should have the same weight as Project
    # and Context, and a different color. Generic.Traceback would work well.

    # Regex patterns for building up rules; dates, priorities, projects, and
    # contexts are all atomic
    # TODO: Make date regex more ISO 8601 compliant
    date_regex = r'\d{4,}-\d{2}-\d{2}'
    priority_regex = r'\([A-Z]\)'
    project_regex = r'\+\S+'
    context_regex = r'@\S+'

    # Compound regex expressions
    complete_one_date_regex = r'(x )(' + date_regex + r')'
    complete_two_date_regex = (complete_one_date_regex + r'( )(' +
                               date_regex + r')')
    priority_date_regex = r'(' + priority_regex + r')( )(' + date_regex + r')'

    tokens = {
        # Should parse starting at beginning of line; each line is a task
        'root': [
            # Complete task entry points: two total:
            # 1. Complete task with two dates
            (complete_two_date_regex, bygroups(CompleteTaskText, Date,
                                               CompleteTaskText, Date),
             'complete'),
            # 2. Complete task with one date
            (complete_one_date_regex, bygroups(CompleteTaskText, Date),
             'complete'),

            # Incomplete task entry points: six total:
            # 1. Priority plus date
            (priority_date_regex, bygroups(Priority, IncompleteTaskText, Date),
             'incomplete'),
            # 2. Priority only
            (priority_regex, Priority, 'incomplete'),
            # 3. Leading date
            (date_regex, Date, 'incomplete'),
            # 4. Leading context
            (context_regex, Context, 'incomplete'),
            # 5. Leading project
            (project_regex, Project, 'incomplete'),
            # 6. Non-whitespace catch-all
            (r'\S+', IncompleteTaskText, 'incomplete'),
        ],

        # Parse a complete task
        'complete': [
            # Newline indicates end of task, should return to root
            (r'\s*\n', CompleteTaskText, '#pop'),
            # Tokenize contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Tokenize non-whitespace text
            (r'\S+', CompleteTaskText),
            # Tokenize whitespace not containing a newline
            (r'\s+', CompleteTaskText),
        ],

        # Parse an incomplete task
        'incomplete': [
            # Newline indicates end of task, should return to root
            (r'\s*\n', IncompleteTaskText, '#pop'),
            # Tokenize contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Tokenize non-whitespace text
            (r'\S+', IncompleteTaskText),
            # Tokenize whitespace not containing a newline
            (r'\s+', IncompleteTaskText),
        ],
    }


class NotmuchLexer(RegexLexer):
    """
    For Notmuch email text format.

    .. versionadded:: 2.5

    Additional options accepted:

    `body_lexer`
        If given, highlight the contents of the message body with the specified
        lexer, else guess it according to the body content (default: ``None``).
    """

    name = 'Notmuch'
    url = 'https://notmuchmail.org/'
    aliases = ['notmuch']

    def _highlight_code(self, match):
        """
        Tokenize the text of a message part with a nested lexer.

        The lexer named by the `body_lexer` option is used when set;
        otherwise one is guessed from the (stripped) part content.  If no
        lexer can be found the plain ``text`` lexer is used.  Indices from
        the nested lexer are shifted by the start of the matched group so
        they refer to positions in the whole input rather than in the part.
        """
        code = match.group(1)

        try:
            if self.body_lexer:
                lexer = get_lexer_by_name(self.body_lexer)
            else:
                lexer = guess_lexer(code.strip())
        except ClassNotFound:
            lexer = get_lexer_by_name('text')

        offset = match.start(1)
        for index, token, value in lexer.get_tokens_unprocessed(code):
            yield offset + index, token, value

    tokens = {
        'root': [
            (r'\fmessage\{\s*', Keyword, ('message', 'message-attr')),
        ],
        'message-attr': [
            (r'(\s*id:\s*)(\S+)', bygroups(Name.Attribute, String)),
            (r'(\s*(?:depth|match|excluded):\s*)(\d+)',
             bygroups(Name.Attribute, Number.Integer)),
            (r'(\s*filename:\s*)(.+\n)',
             bygroups(Name.Attribute, String)),
            default('#pop'),
        ],
        'message': [
            (r'\fmessage\}\n', Keyword, '#pop'),
            (r'\fheader\{\n', Keyword, 'header'),
            (r'\fbody\{\n', Keyword, 'body'),
        ],
        'header': [
            (r'\fheader\}\n', Keyword, '#pop'),
            (r'((?:Subject|From|To|Cc|Date):\s*)(.*\n)',
             bygroups(Name.Attribute, String)),
            (r'(.*)(\s*\(.*\))(\s*\(.*\)\n)',
             bygroups(Generic.Strong, Literal, Name.Tag)),
        ],
        'body': [
            (r'\fpart\{\n', Keyword, 'part'),
            (r'\f(part|attachment)\{\s*', Keyword, ('part', 'part-attr')),
            (r'\fbody\}\n', Keyword, '#pop'),
        ],
        'part-attr': [
            (r'(ID:\s*)(\d+)', bygroups(Name.Attribute, Number.Integer)),
            (r'(,\s*)((?:Filename|Content-id):\s*)([^,]+)',
             bygroups(Punctuation, Name.Attribute, String)),
            (r'(,\s*)(Content-type:\s*)(.+\n)',
             bygroups(Punctuation, Name.Attribute, String)),
            default('#pop'),
        ],
        'part': [
            (r'\f(?:part|attachment)\}\n', Keyword, '#pop'),
            # nested parts push another 'part' state
            (r'\f(?:part|attachment)\{\s*', Keyword, ('#push', 'part-attr')),
            (r'^Non-text part: .*\n', Comment),
            # everything up to the closing part/attachment marker
            (r'(?s)(.*?(?=\f(?:part|attachment)\}\n))', _highlight_code),
        ],
    }

    def analyse_text(text):
        """Return 1.0 if *text* starts with a Notmuch message marker, else 0.0."""
        return 1.0 if text.startswith('\fmessage{') else 0.0

    def __init__(self, **options):
        """Store the optional `body_lexer` name, then initialize the lexer."""
        self.body_lexer = options.get('body_lexer', None)
        RegexLexer.__init__(self, **options)


class KernelLogLexer(RegexLexer):
    """
    For Linux Kernel log ("dmesg") output.

    .. versionadded:: 2.6
    """
    name = 'Kernel log'
    aliases = ['kmsg', 'dmesg']
    filenames = ['*.kmsg', '*.dmesg']

    tokens = {
        'root': [
            # Decoded "facility:level: " prefixes.  The level name is padded
            # with spaces to a fixed width of six characters, so the padding
            # inside these patterns is significant.
            (r'^[^:]+:debug : (?=\[)', Text, 'debug'),
            (r'^[^:]+:info  : (?=\[)', Text, 'info'),
            (r'^[^:]+:warn  : (?=\[)', Text, 'warn'),
            (r'^[^:]+:notice: (?=\[)', Text, 'warn'),
            (r'^[^:]+:err   : (?=\[)', Text, 'error'),
            (r'^[^:]+:crit  : (?=\[)', Text, 'error'),
            # No decoded prefix: classify the line by keywords instead.
            (r'^(?=\[)', Text, 'unknown'),
        ],
        'unknown': [
            (r'^(?=.+(warning|notice|audit|deprecated))', Text, 'warn'),
            (r'^(?=.+(error|critical|fail|Bug))', Text, 'error'),
            default('info'),
        ],
        'base': [
            # "[ 1234.567890] " timestamp
            (r'\[[0-9. ]+\] ', Number),
            # text directly after the timestamp, up to the first colon
            (r'(?<=\] ).+?:', Keyword),
            (r'\n', Text, '#pop'),
        ],
        'debug': [
            include('base'),
            (r'.+\n', Comment, '#pop')
        ],
        'info': [
            include('base'),
            (r'.+\n', Text, '#pop')
        ],
        'warn': [
            include('base'),
            (r'.+\n', Generic.Strong, '#pop')
        ],
        'error': [
            include('base'),
            (r'.+\n', Generic.Error, '#pop')
        ]
    }