The following code examples, extracted from Python open-source projects, illustrate how to use pygments.lex().
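Before the project examples, a minimal usage sketch (not taken from any of the projects below): pygments.lex(code, lexer) takes a source string and a lexer instance and yields (token_type, value) pairs.

import pygments
from pygments.lexers import PythonLexer

# Iterate over the (token_type, value) pairs produced by the lexer.
for ttype, value in pygments.lex('print("hello")', PythonLexer()):
    print(ttype, repr(value))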
def highlight(self, block):
    """Method called on each block to highlight its content"""
    tokens = pygments.lex(block, self.python_lexer)
    if self.format_rst:
        from pygments.token import Token
        toks = []
        for token in tokens:
            if token[0] == Token.String.Doc and len(token[1]) > 6:
                toks += pygments.lex(token[1][:3], self.python_lexer)
                # parse doc string content by rst lexer
                toks += pygments.lex(token[1][3:-3], self.rst_lexer)
                toks += pygments.lex(token[1][-3:], self.python_lexer)
            elif token[0] == Token.Comment.Single:
                toks.append((Token.Comment.Single, token[1][0]))
                # parse comment content by rst lexer
                # remove the extra newline added by rst lexer
                toks += list(pygments.lex(token[1][1:], self.rst_lexer))[:-1]
            else:
                toks.append(token)
        tokens = toks
    return pygments.format(tokens, self.formatter)
def fix_preprocessor_defs(tokens, lexer):
    res = []
    for t in tokens:
        token_split = t[1].split()
        if not is_token_subtype(t[0], Token.Literal.String) and len(token_split) > 1:
            if t[0] == Token.Comment.PreprocFile:
                if t[1].startswith('"'):
                    end = t[1].find('"', t[1].find('"') + 1) + 1
                elif t[1].startswith('<'):
                    end = t[1].find('>') + 1
                else:
                    end = t[1].find(' ') + 1
                res.append((t[0], t[1][:end]))
            else:
                token_lexed = list(lex(' '.join(token_split), lexer))
                res += token_lexed
        else:
            res += (t,)
    return res
def __iter__(self):
    """parse code string and yield "classified" tokens"""
    try:
        tokens = self.lex()
    except IOError:
        log.info("Pygments lexer not found, using fallback")
        # TODO: write message to INFO
        yield ('', self.code)
        return

    for ttype, value in self.join(tokens):
        yield (_get_ttype_class(ttype), value)


# code_block_directive
# --------------------
# ::
def __iter__(self):
    """Parse self.code and yield "classified" tokens.
    """
    if self.lexer is None:
        yield ([], self.code)
        return
    tokens = pygments.lex(self.code, self.lexer)
    for tokentype, value in self.merge(tokens):
        if self.tokennames == 'long':  # long CSS class args
            classes = str(tokentype).lower().split('.')
        else:  # short CSS class args
            classes = [_get_ttype_class(tokentype)]
        classes = [cls for cls in classes if cls not in unstyled_tokens]
        yield (classes, value)
def tokenize_code(code, lexer, language, literal_option):
    tokens = lex(code, lexer)
    tokensList = list(tokens)

    # Strip comments and alter strings
    lexedWoComments = tokensExceptTokenType(tokensList, Token.Comment,
                                            retainedTypes=[Token.Comment.Preproc, Token.Comment.PreprocFile])
    lexedWoComments = tokensExceptTokenType(lexedWoComments, Token.Literal.String.Doc)
    lexedWoComments = fixTypes(lexedWoComments, language)  # Alter the pygments lexer types to be more comparable between our languages
    lexedWoComments = convertNamespaceTokens(lexedWoComments, language)
    lexedWoComments = fix_preprocessor_defs(lexedWoComments, lexer)
    lexedWoComments = tokensExceptTokenType(lexedWoComments, Token.Comment,
                                            retainedTypes=[Token.Comment.Preproc, Token.Comment.PreprocFile])

    if literal_option == 0:
        lexedWoComments = modifyStrings(lexedWoComments, underscoreString)
    elif literal_option == 1:
        lexedWoComments = modifyStrings(lexedWoComments, singleStringToken)
    elif literal_option == 2:
        lexedWoComments = modifyStrings(lexedWoComments, spaceString)
    elif literal_option == 3:
        lexedWoComments = modifyStrings(lexedWoComments, singleStringToken)
        lexedWoComments = collapseStrings(lexedWoComments)
        lexedWoComments = modifyNumbers(lexedWoComments, singleNumberToken)

    return get_tokenization(lexedWoComments, lexer)


# source_file: path of source file to be tokenized
# language: programming language of source file, e.g. "c"
# literal_option:
#   0 -> replace all spaces in strings with _
#   1 -> replace all strings with a <str> tag
#   2 -> add spaces to the ends of the strings
#   3 -> collapse strings to <str> and collapses numbers to a type as well.
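The literal_option values described in the comments above rely on project-specific helpers (modifyStrings, singleStringToken, collapseStrings, and so on) that are not shown here. As a rough, self-contained illustration of option 1, the following hypothetical sketch replaces each string literal in a lexed token stream with a single <str> tag; the helper name is an assumption, not part of the project code.

from pygments import lex
from pygments.lexers import CLexer
from pygments.token import Token, is_token_subtype

def collapse_string_literals(tokens, tag='<str>'):
    # Hypothetical stand-in for the project's string helpers: Pygments splits a
    # literal into opening quote, contents, and closing quote, so replace each
    # run of consecutive string tokens with one placeholder token.
    in_string = False
    for ttype, value in tokens:
        if is_token_subtype(ttype, Token.Literal.String):
            if not in_string:
                yield (Token.Literal.String, tag)
                in_string = True
        else:
            in_string = False
            yield (ttype, value)

for ttype, value in collapse_string_literals(lex('printf("hello world");', CLexer())):
    print(ttype, repr(value))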
def lex(self):
    # Get lexer for language (use text as fallback)
    try:
        if self.language and str(self.language).lower() != 'none':
            lexer = get_lexer_by_name(self.language.lower(),
                                      **self.custom_args)
        else:
            lexer = get_lexer_by_name('text', **self.custom_args)
    except ValueError:
        log.info("no pygments lexer for %s, using 'text'" % self.language)
        # what happens if pygments isn't present ?
        lexer = get_lexer_by_name('text')
    return pygments.lex(self.code, lexer)
def format_testcase_diff(diff):
    """Format a testcase output diff.

    PARAMETERS
        diff: the diff content

    RETURNS
        a list of pygments' Tokens
    """

    def new_line_token():
        """Generate a new line token."""
        return Token.Whitespace, '\n'

    def indent_token():
        """Generate an indentation space token."""
        return Token.Whitespace, ' ' * 4

    tokens = []
    new_line = True

    # Because of logging prefixes, skip the first line to avoid
    # misalignment.
    tokens.append(new_line_token())

    for ttype, value in pygments.lex(diff, DiffLexer()):
        for subval in value.split('\n'):
            if new_line:
                tokens.append(indent_token())
            new_line = not subval
            if subval:
                tokens.append((ttype, subval))
            else:
                tokens.append(new_line_token())
    return tokens
def tokens(self, event=None):
    """Highlight tokens as rendered by Pygments.

    Seems to only work after textarea is updated, though calling
    update_idletasks has no effect. The problem can be solved by recalling
    the function if there is no bbox, (as with update_linenumbers), or
    figure out what is not updated when running this function (bbox was
    the case in update_linenumbers).
    """
    # http://stackoverflow.com/a/30199105
    from pygments import lex, highlight
    from pygments.lexers import PythonLexer
    from pygments.formatters import HtmlFormatter

    # don't use because multiline strings can start at beginning and end in visible view
    #tv = self.mainframe.texthelper.top_visible(self.textarea)

    # use since highlight works if multiline str not properly closed
    bv = self.mainframe.texthelper.bottom_visible(self.textarea)
    data = self.textarea.get("1.0", bv)  # "end-1c"

    if data == self.prevdata:
        return

    self.clear_tokens()
    #print( highlight(data, PythonLexer(), HtmlFormatter()))

    prev_content = ''
    i = 0
    for token, content in lex(data, PythonLexer()):
        lencontent = len(content)

        # this happens sometimes in lubuntu
        if not content:
            #print('no content in HighLight.tokens() loop')
            continue

        #str(token) == 'Token.Literal.String.Doc' \
        if self.mainframe.texthelper.visible(self.textarea, '1.0 + %dc' % i) \
                or self.mainframe.texthelper.visible(self.textarea, '1.0 + %dc' % (i + lencontent)):
            self.textarea.mark_set("range_start", "1.0 + %dc" % i)
            self.textarea.mark_set("range_end", "range_start + %dc" % lencontent)
            self.textarea.tag_add(str(token), "range_start", "range_end")

        i += lencontent

    self.prevdata = data
def get_tokenization(lexedWoComments, lexer):
    tokenized_string = ''
    token_types = []
    curr_line_empty = True
    for t in lexedWoComments:
        token_type = str(t[0])
        token = t[1]
        token_stripped = token.strip()

        # Pygments will sometimes lex many tokens as one
        # This can occur with preprocessor directives and definitions in C
        # In this case, we need to lex that whole line
        num_tokens = len(token.split())
        if num_tokens > 1:
            # Need to manually lex each space separated token on occasions
            # when pygments doesn't lex properly
            line_split = token.split()
            line_lexed = []
            for temp_token in line_split:
                token_lexed = list(lex(temp_token, lexer))
                for lexed in token_lexed:
                    if lexed[1] != "\n":
                        line_lexed.append(lexed)
            line_lexed.append((Token.Text, '\n'))
            line_code, line_types = get_tokenization(line_lexed, lexer)
            tokenized_string += line_code
            token_types += line_types
            curr_line_empty = True
            continue

        if '\n' in token:
            if curr_line_empty:
                if (t[0] != Token.Text or t[0] != Token.Comment.Preproc) and token_stripped != '':
                    tokenized_string += token_stripped + "\n"
                    token_types.append(token_type)
            else:
                tokenized_string += token_stripped + "\n"
                # Edge case for stray "\" in code
                if token_stripped == "\\":
                    token_types.append(token_type)
            curr_line_empty = True
        elif t[0] != Token.Text and len(token_stripped) > 0:
            curr_line_empty = False
            tokenized_string += token + ' '
            token_types.append(token_type)

    assert len(tokenized_string.split()) == len(token_types), \
        "{0} != {1}".format(len(tokenized_string.split()), len(token_types))
    return tokenized_string, token_types
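Both fix_preprocessor_defs and get_tokenization above work around the same Pygments behaviour: a preprocessor directive or definition can come back as one multi-word token, which then has to be re-lexed word by word. A minimal, hypothetical sketch of that re-lexing step is shown below; the helper name and the sample input are assumptions, not code from the projects above.

from pygments import lex
from pygments.lexers import CLexer
from pygments.token import Token, is_token_subtype

def relex_multiword_tokens(tokens, lexer):
    # Re-lex any non-string token whose value still spans several words,
    # dropping the trailing newline that lex() appends to each fragment
    # (the same filter used in get_tokenization above).
    for ttype, value in tokens:
        if not is_token_subtype(ttype, Token.Literal.String) and len(value.split()) > 1:
            for sub_ttype, sub_value in lex(' '.join(value.split()), lexer):
                if sub_value != '\n':
                    yield (sub_ttype, sub_value)
        else:
            yield (ttype, value)

# Example: the body of a #define often arrives as a single Comment.Preproc token.
for ttype, value in relex_multiword_tokens(lex('#define MAX 10\n', CLexer()), CLexer()):
    print(ttype, repr(value))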