The following 30 code examples, extracted from open-source Python projects, illustrate how to use Python's re module (documented at https://docs.python.org/3/library/re.html).
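As a quick orientation before the extracted examples, here is a minimal sketch (not drawn from any of the projects below) of the re calls that recur throughout: compiling a pattern once, searching for the first match, and collecting all matches.

import re

# Compile once, reuse many times (see the lexer and tag-extraction examples below).
pattern = re.compile(r"(?P<word>\w+)")

# search() returns the first match object, or None if nothing matched.
match = pattern.search("hello world")
if match is not None:
    print(match.group("word"))  # -> "hello"

# findall() returns all non-overlapping matches as strings.
print(pattern.findall("hello world"))  # -> ['hello', 'world']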
def quote_split(s, splitchar=';'):
    rv = [""]
    inquote = False
    for i in xrange(len(s)):
        if not inquote and s[i] == splitchar:
            rv.append("")
            continue
        rv[-1] = rv[-1] + s[i]
        if s[i] == "'":
            inquote = not inquote
    return rv

## Return a compiled regex which matches the word-boundaried word 'x'
## See http://docs.python.org/2/library/re.html under the "\b"
## special matcher
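The comment above introduces a word-boundary regex builder whose body is not part of this extract. A minimal sketch of what such a helper could look like (the name word_boundary_regex is hypothetical):

import re

def word_boundary_regex(x):
    # \b matches at a word boundary, so the result matches 'x' only as a
    # whole word. re.escape() guards against regex metacharacters in 'x'.
    return re.compile(r"\b" + re.escape(x) + r"\b")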
def set_html(self, html):
    """
    When setting the html for this Google Document we do two
    things:

    1. We extract the content from the html. Using a regular
       expression we pull the meat of the document out of the body
       of the html, we also cut off the footer Google adds on
       automatically.

    2. We extract the various sections from the content of the
       document. Again using a regular expression, we look for h1,
       h2, ... tags to split the document up into sections.

    Note: it is important when you are writing your Google
    Document to use the heading text styles, so this code will
    split things correctly.
    """
    self._html = html
    self._extract_content()
    self._extract_sections()
def _construct_section_tree(self):
    """
    For some weird reason Google Documents doesn't like nesting
    lists, so their table of contents requires a bunch of special
    formatting. Instead of trying to hack off what they provide
    us, we create a tree of sections based on each section's
    level. This tree will be used to construct the html for the
    table of contents.
    """
    self._section_tree = TreeNode(Section(level=0))
    current_node = self._section_tree
    for section in self._sections:
        while section['level'] <= current_node.value['level']:
            current_node = current_node.parent
        while section['level'] > current_node.value['level'] + 1:
            empty_section = Section(level=current_node.value['level'] + 1)
            current_node = current_node.add_child(empty_section)
        assert section['level'] == current_node.value['level'] + 1
        current_node = current_node.add_child(section)
def _navigation_list(self, node=None):
    """
    Return an html representation of the table of contents for
    this document. This is done recursively, adding a list item
    for each element in the tree, and an unordered list if this
    node has children.

    I might want to double check that this html is the correct way
    to nest lists.
    """
    if node is None:
        self._construct_section_tree()
        return self._navigation_list(self._section_tree)
    result = ""
    if 'title' in node.value and 'id' in node.value:
        result += '<li>%s</li>' % node.value.url()
    if len(node) > 0:
        result += "<ul>%s</ul>" % \
            "\n".join([self._navigation_list(child) for child in node])
    return result
def get_db_connection():
    if DbHelperM.__db_connection is None:
        DbHelperM.__db_connection = sqlite3.connect(wbd.wbd_global.get_database_filename())

        # Upgrading the database
        # Very good upgrade explanation:
        # http://stackoverflow.com/questions/19331550/database-change-with-software-update
        # More info here: https://www.sqlite.org/pragma.html#pragma_schema_version
        current_db_ver_it = get_schema_version(DbHelperM.__db_connection)
        target_db_ver_it = max(upgrade_steps)
        for upgrade_step_it in range(current_db_ver_it + 1, target_db_ver_it + 1):
            if upgrade_step_it in upgrade_steps:
                upgrade_steps[upgrade_step_it](DbHelperM.__db_connection)
                set_schema_version(DbHelperM.__db_connection, upgrade_step_it)
        DbHelperM.__db_connection.commit()
        # TODO: Where do we close the db connection? (Do we need to close it?)
        # http://stackoverflow.com/questions/3850261/doing-something-before-program-exit
    return DbHelperM.__db_connection
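The upgrade_steps mapping and the get_schema_version / set_schema_version helpers used above are not part of this extract. A minimal sketch of the pattern the code assumes, using SQLite's user_version pragma to track the schema version (the migration functions and their SQL are hypothetical):

import sqlite3

def get_schema_version(db_connection: sqlite3.Connection) -> int:
    # user_version is an integer slot SQLite reserves for the application.
    return db_connection.execute("PRAGMA user_version").fetchone()[0]

def set_schema_version(db_connection: sqlite3.Connection, version: int) -> None:
    # PRAGMA arguments cannot be bound as parameters, so format the int directly.
    db_connection.execute("PRAGMA user_version = %d" % version)

def upgrade_1_to_2(db_connection: sqlite3.Connection) -> None:
    # Hypothetical migration introducing a column in schema version 2.
    db_connection.execute("ALTER TABLE diary_entry ADD COLUMN favorite INTEGER DEFAULT 0")

def upgrade_2_to_3(db_connection: sqlite3.Connection) -> None:
    # Hypothetical migration for schema version 3.
    db_connection.execute("CREATE TABLE IF NOT EXISTS reminder (id INTEGER PRIMARY KEY)")

# Keyed by target schema version; get_db_connection() applies these in order.
upgrade_steps = {
    2: upgrade_1_to_2,
    3: upgrade_2_to_3,
}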
def assert_found(patt, filename, msg=None, encoding='utf-8'):
    """Assert that regex pattern ``patt`` is found in the file ``filename``.

    :arg patt: The regex pattern to search.
        Any standard Python `regular expression
        <https://docs.python.org/3.6/library/re.html#regular-expression-syntax>`_
        is accepted.
    :arg filename: The name of the file to examine.
        Any :class:`OSError` raised while processing the file will be
        propagated as a :class:`reframe.core.exceptions.SanityError`.
    :arg encoding: The name of the encoding used to decode the file.
    :returns: ``True`` on success.
    :raises reframe.core.exceptions.SanityError: if assertion fails.
    """
    num_matches = count(finditer(patt, filename, encoding))
    try:
        evaluate(assert_true(num_matches))
    except SanityError:
        error_msg = msg or "pattern `{0}' not found in `{1}'"
        raise SanityError(_format(error_msg, patt, filename))
    else:
        return True
def extractall(patt, filename, tag=0, conv=None, encoding='utf-8'):
    """Extract all values from the capturing group ``tag`` of a matching
    regex ``patt`` in the file ``filename``.

    :arg patt: The regex pattern to search.
        Any standard Python `regular expression
        <https://docs.python.org/3.6/library/re.html#regular-expression-syntax>`_
        is accepted.
    :arg filename: The name of the file to examine.
    :arg encoding: The name of the encoding used to decode the file.
    :arg tag: The regex capturing group to be extracted.
        Group ``0`` refers always to the whole match.
        Since the file is processed line by line, this means that group ``0``
        returns the whole line that was matched.
    :arg conv: A callable that takes a single argument and returns a new
        value. If provided, it will be used to convert the extracted values
        before returning them.
    :returns: A list of the extracted values from the matched regex.
    :raises reframe.core.exceptions.SanityError: In case of errors.
    """
    return list(evaluate(x)
                for x in extractiter(patt, filename, tag, conv, encoding))
def __str__(self):
    return "the command '{0}' is unknown".format(self.msg)

## The command object will use a "line source" object which
## should expose (at least) the context protocol
## (http://docs.python.org/2/library/stdtypes.html#context-manager-types)
## and the iterator protocol
## (http://docs.python.org/2/library/stdtypes.html#iterator-types)
##
## The ".run( <linesource> )" method uses the line source object
## as:
##     self.run( <linesource> ):
##         with <linesource> as tmp:
##             for line in tmp:
##                 execute( line )
##
## This allows the linesource object to create a context which
## will be automagically destroyed when the ".run()" is finished.
##
## The "readkbd" line source class uses this context to save &
## restore the current history [== Python interactive shell's history]
## and to restore and save the history of the command line environment,
## the application's interactive shell.
##
## This way the two histories remain nicely separated/unpolluted
##
## This is the base class implementing a push/pop of current history
## and push/pop of (temporary) alternative history. Classes which
## want their own readline history saved can derive from this one.
def run(self):
    """Run module's code."""
    conversations = self._get_conversation_list()

    # Output
    title = "Conversation List"
    header = None
    output_format = self.get_option_value("OUTPUT_FORMAT").lower()
    if (output_format == "stdout"):
        print title
        self._print_table(conversations)
    elif (output_format == "html"):
        if (not os.path.isdir(self.output_dir)):
            os.mkdir(self.output_dir)
        output_prefix = self.get_option_value("OUTPUT_FILE_NAME_PREFIX")
        file_full_path = self.output_dir + "/" + output_prefix + ".html"
        html.create_document_from_row_list(title, header, conversations, file_full_path)
        print "Output saved to: " + file_full_path
    elif (output_format == "pdf"):
        if (not os.path.isdir(self.output_dir)):
            os.mkdir(self.output_dir)
        output_prefix = self.get_option_value("OUTPUT_FILE_NAME_PREFIX")
        file_full_path = self.output_dir + "/" + output_prefix + ".pdf"
        pdf.create_document_from_row_list(title, header, conversations, file_full_path)
        print "Output saved to: " + file_full_path
    else:
        print "Unsupported OUTPUT_FORMAT"

# ***************************************************************
# HELPER methods
# ***************************************************************
def to_html(self):
    return render_to_string('section.html', self)
def _extract_sections(self):
    """
    Here is an example of what a section header looks like in the
    html of a Google Document:

    <h3 class="c1"><a name="h.699ffpepx6zs"></a><span>Hello World </span></h3>

    We split the content of the Google Document up using a regular
    expression that matches the above header. re.split is a pretty
    cool function if you haven't tried it before. It puts the
    matching groups into the list as well as the content between
    the matches. Check it out here:

    http://docs.python.org/library/re.html#re.split

    One big thing we do in this method is replace the ugly section
    id that Google creates with a nicely slugified version of the
    section title. This makes for pretty urls.
    """
    self._sections = []
    header = r'<h(?P<level>\d) class="[^"]+">' \
             r'<a name="(?P<id>[^"]+)"></a>' \
             r'<span>(?P<title>[^<]+)</span>' \
             r'</h\d>'
    l = re.split(header, self._content)
    l.pop(0)
    while l:
        section = Section(
            # hack: cause we started with h3 in google docs
            level=int(l.pop(0)) - 2,
            id=l.pop(0),
            title=l.pop(0).decode('utf8'),
            content=l.pop(0),
        )
        section['id'] = slugify(section['title'])
        if section['level'] >= 1:
            self._sections.append(section)
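The docstring above hinges on a detail of re.split that is easy to miss: when the pattern contains capturing groups, the captured text is interleaved in the result list with the text between matches. A minimal standalone illustration:

import re

# With capturing groups, re.split keeps what each group matched.
parts = re.split(r'<h(\d)>([^<]+)</h\1>', 'intro<h2>Title</h2>body')
print(parts)  # -> ['intro', '2', 'Title', 'body']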
def to_html(self):
    """
    Return a cleaned up HTML representation of this Google
    Document.
    """
    return render_to_string('google_doc.html', {
        'nav': self._navigation_html(),
        'content': '\n'.join([s.to_html() for s in self._sections]),
    })
def any_of(s):
    """s must be in the right format. See
    https://docs.python.org/3/library/re.html#regular-expression-syntax ."""
    return wrap('[', Dinant(s, escape=False), ']')


# another helper function
def simple_lexer(rules):
    """
    A very simple lexer factory based on a recipe in the documentation of
    Python's re module.
    """
    # This is a simplified version of the technique described at
    # https://docs.python.org/3/library/re.html#writing-a-lexer
    regex = '|'.join(r'(?P<%s>%s)' % item for item in rules)
    regex += r'|(?P<whitespace>\s+)|(?P<error>.+)'
    regex = re.compile(regex)

    def lexer(expr):
        for match in re.finditer(regex, expr):
            typ = match.lastgroup
            value = match.group(typ)
            if typ == 'whitespace':
                continue
            elif typ == 'error':
                raise SyntaxError('invalid value: %r' % value)
            yield Token(typ, value)

    lexer.which = 'simple'
    return lexer
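A usage sketch for the factory above (assuming Token is a simple named tuple, which this extract does not show). The rules are (name, pattern) pairs combined into a single alternation, with earlier rules winning when alternatives overlap:

from collections import namedtuple

Token = namedtuple('Token', ['typ', 'value'])

# (name, pattern) pairs; whitespace is skipped, anything unmatched is an error.
rules = [('number', r'\d+'), ('op', r'[+\-*/]')]
lexer = simple_lexer(rules)
print(list(lexer('1 + 23')))
# -> [Token(typ='number', value='1'), Token(typ='op', value='+'),
#     Token(typ='number', value='23')]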
def get_all_tags_or_friends(i_special_char_str: str) -> list:
    ret_tag_tuple_list_list = []
    # ret_tag_tuple_list_list: [("#tag1", [id1, id2, ___]), ("#tag2", [id1, id3, ___]), ___]
    db_connection = DbHelperM.get_db_connection()
    db_cursor = db_connection.cursor()
    db_cursor_result = db_cursor.execute(
        "SELECT * FROM " + DbSchemaM.DiaryEntryTable.name
        + " WHERE " + DbSchemaM.DiaryEntryTable.Cols.diary_entry
        + " LIKE " + '"%' + i_special_char_str + '%"'
    )
    # -http://sqlite.org/lang_expr.html#like
    diary_db_te_list = db_cursor_result.fetchall()
    for diary_db_te in diary_db_te_list:
        diary_entry = DiaryEntryM(*diary_db_te)
        string_with_hashtag_str = diary_entry.diary_text
        t_diary_id_int = diary_entry.id
        regexp_pattern_obj = re.compile("\\" + i_special_char_str + r"\w+")
        # Please note: we need to escape the special character (for example "^")
        # because it may have a special meaning in a regular expression
        regexp_search_result_list = regexp_pattern_obj.findall(string_with_hashtag_str)
        # https://docs.python.org/3/library/re.html
        for t_re_tag_str in regexp_search_result_list:
            # -regexp_search_result_list: ["#tag1", "#tag2", ___]
            flag_boolean = False
            for (t_ret_tag_str, t_ret_diary_id_list) in ret_tag_tuple_list_list:
                if t_re_tag_str == t_ret_tag_str:
                    t_ret_diary_id_list.append(t_diary_id_int)
                    flag_boolean = True
                    break
            if not flag_boolean:
                ret_tag_tuple_list_list.append((t_re_tag_str, [t_diary_id_int]))
    db_connection.commit()
    # TODO: Removing duplicates
    return ret_tag_tuple_list_list
def _get_mpl_date(dates, fmt='%d.%m.%Y %H:%M'):
    """Convert date strings into matplotlib time format.

    Parameters:
        dates (ndarray): Array containing date strings.
        fmt (str): Date string format [0].

    [0] http://pubs.opengroup.org/onlinepubs/009695399/functions/strptime.html

    Returns:
        np.array: Matplotlib time values.
    """
    return np.array([strpdate2num(fmt)(d) for d in dates])
def read_profile(filename, var_regex=None, var_key='PROFILE', **kwargs):
    """Read scattering coefficients from CSV file.

    Parameters:
        filename (str): Path to CSV file.
        var_regex (str): Python regular expression [0] matching
            the variable name of the profile.
        var_key (str): Dictionary key for extracted profile.
        **kwargs: Additional keyword arguments passed to `read`.

    [0] https://docs.python.org/3.1/library/re.html

    Returns:
        dict: Dictionary containing the data arrays and the stacked profile.
    """
    profile_key = var_key + '_Z'
    output = read(filename, **kwargs)

    p = re.compile(var_regex)
    var_names = [var for var in output.keys() if p.match(var)]
    var_names.sort()

    profile = np.vstack([output[v] for v in var_names])
    # Extract height information from variable name.
    z = [float(re.sub('[^0-9]', '', v)) for v in var_names]

    output[var_key] = np.ma.masked_invalid(profile)
    output[profile_key] = np.array(z)

    return output
def read_scat(filename, var_regex=r'CLB_B\d{5}', var_key='CLB_MATRIX', **kwargs):
    """Read scattering coefficients from CSV file.

    Parameters:
        filename (str): Path to CSV file.
        var_regex (str): Python regular expression [0] matching
            the variable name of the profile.
        var_key (str): Dictionary key for extracted profile.
        **kwargs: Additional keyword arguments passed to `read_profile`.

    [0] https://docs.python.org/3.1/library/re.html

    Returns:
        ndarray, ndarray: scattering coefficient, height levels
    """
    output = read_profile(
        filename,
        var_key=var_key,
        var_regex=var_regex,
        **kwargs,
    )

    back_scat = output[var_key]
    back_scat = np.ma.masked_less(back_scat, 0)
    output[var_key] = back_scat

    return output
def _convert_simple_pattern(self, regex_string):  # EXPERIMENTAL
    """This is EXPERIMENTAL: Consider option to recognize "simple" patterns
    and automatically put them in the trie, otherwise use Python matcher.

    Convert a simple pattern to a form that can be inserted into a
    `RegexTrieDict`, if possible.  Returns `None` if the pattern is too
    complicated.  A simple pattern is essentially defined by what this
    routine is implemented to do (and what a `RegexTrieDict` can/should
    do)."""
    return None

    # TODO the immediate below seems to work for some very simple patterns.
    simple_regex_patt = re.compile(r"^[a-zA-Z0-9_\-]+$", re.VERBOSE|re.UNICODE)
    match = simple_regex_patt.match(regex_string)
    if match is None:
        return None
    return regex_string  # No processing needed for very simple.

    # SCRATCH BELOW

    # Note negative lookbehind assertion (?<!\\) for escape before
    # the strings which start Python regex special chars.
    non_simple_regex_contains = \
        r"""( ( (?<!\\)[.^$*+?{[|(] )+    # Start of special char.
            |   ( [\\][ABdDsSwWZ] )+      # Python regex escape.
            ))"""
    compiled_non_simple_regex_contains = re.compile(
        non_simple_regex_contains, re.VERBOSE|re.UNICODE)

    def is_simple_pattern(regex_string):
        # Could be single-char in brackets!
        # https://docs.python.org/2.0/ref/strings.html
        match_object = compiled_non_simple_regex_contains.search(regex_string)
        #matched_string = regex_string[match_object.start():match_object.end()]
        #print(" substring", matched_string)
        return not bool(match_object)

    #if is_simple_pattern(regex_string):
    #    print("simple pattern", regex_string)
    #else:
    #    print("non-simple pattern", regex_string)
def findall(patt, filename, encoding='utf-8'):
    """Get all matches of regex ``patt`` in ``filename``.

    :arg patt: The regex pattern to search.
        Any standard Python `regular expression
        <https://docs.python.org/3.6/library/re.html#regular-expression-syntax>`_
        is accepted.
    :arg filename: The name of the file to examine.
    :arg encoding: The name of the encoding used to decode the file.
    :returns: A list of raw `regex match objects
        <https://docs.python.org/3.6/library/re.html#match-objects>`_.
    :raises reframe.core.exceptions.SanityError: In case an :class:`OSError`
        is raised while processing ``filename``.
    """
    return list(evaluate(x) for x in finditer(patt, filename, encoding))
def parse_phone(phone_num):
    """Takes a phone number in a variety of formats and returns 10 digits.

    arguments:
        phone_num: string containing a phone number, in one of these formats:
            (555) 555-5555
            (555)555-5555
            555-555-5555
            5555555555

    returns:
        string of 10 digits (neglecting errors for now), or None if error.

    Examples / doctests:

    >>> print parse_phone("(555) 555-5555")
    5555555555

    >>> print parse_phone("(555)555-5555")
    5555555555

    >>> print parse_phone("555-555-5555")
    5555555555

    >>> print parse_phone("555555-5555")
    5555555555

    >>> print parse_phone("(555) 555-55555")
    None
    """

    # a somewhat obscure regular expression to get the data out of the phone
    # number in various formats. (see http://regex101.com for more details on
    # -- and a sandbox for -- regular expressions.)
    matches = re.match(r'^\(?(\d{3})\)?[\s\-]?(\d{3})-?(\d{4})$', phone_num)

    if not matches:
        # the phone number wasn't in one of the acceptable formats
        return None

    # get the data from the regular expression
    # for more details, see
    # https://docs.python.org/2/library/re.html#match-objects
    area_code = matches.group(1)
    exchange = matches.group(2)
    other_part = matches.group(3)

    return "{}{}{}".format(area_code, exchange, other_part)
def read_ionization_energies( element ):
    """
    Read the ionization energies from a data file

    Parameters
    ----------
    element: string
        The atomic symbol of the considered ionizable species
        (e.g. 'He', 'N' ; do not use 'Helium' or 'Nitrogen')

    Returns
    -------
    An array with one array element per ionization state, containing the
    ionization energy in Joules.
    """
    # Open and read the file atomic_data.txt
    filename = os.path.join( os.path.dirname(__file__), 'atomic_data.txt' )
    with open(filename) as f:
        text_data = f.read()

    # Parse the data using regular expressions (a.k.a. regex)
    # (see https://docs.python.org/2/library/re.html)
    # The regex command below parses lines of the type
    # '\n 10 | Ne IV | +3 | [97.1900]'
    # and only considers those for which the element (Ne in the above example)
    # matches the element which is passed as argument of this function
    # For each line that satisfies this requirement, it extracts a tuple with
    # - the atomic number (represented as (\d+))
    # - the ionization level (represented as the second (\d+))
    # - the ionization energy (represented as (\d+\.*\d*))
    regex_command = \
        '\n\s+(\d+)\s+\|\s+%s\s+\w+\s+\|\s+\+*(\d+)\s+\|\s+\(*\[*(\d+\.*\d*)' \
        % element
    list_of_tuples = re.findall( regex_command, text_data )

    # Return None if the requested element was not found
    if list_of_tuples == []:
        return( None )

    # Go through the list of tuples and fill the array of ionization energies.
    atomic_number = int( list_of_tuples[0][0] )
    assert atomic_number > 0
    energies = np.zeros( atomic_number )
    for ion_level in range( atomic_number ):
        # Check that, when reading the file,
        # we obtained the correct ionization level
        assert ion_level == int( list_of_tuples[ion_level][1] )
        # Get the ionization energy and convert in Joules using e
        energies[ ion_level ] = e * float( list_of_tuples[ion_level][2] )

    return( energies )
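To see what the pattern above extracts, a minimal standalone check against the sample line quoted in the comments (a raw string is used here; \n in a raw string is the regex escape for a newline, so the match is identical):

import re

regex_command = \
    r'\n\s+(\d+)\s+\|\s+%s\s+\w+\s+\|\s+\+*(\d+)\s+\|\s+\(*\[*(\d+\.*\d*)' % 'Ne'
sample = '\n 10 | Ne IV | +3 | [97.1900]'
print(re.findall(regex_command, sample))
# -> [('10', '3', '97.1900')]  (atomic number, ionization level, energy)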
def initial_schema_and_setup(i_db_conn):
    """Auto-increment is not needed in our case:
    https://www.sqlite.org/autoinc.html
    """
    i_db_conn.execute(
        "CREATE TABLE " + DbSchemaM.QuestionTable.name + "("
        + DbSchemaM.QuestionTable.Cols.id + " INTEGER PRIMARY KEY, "
        + DbSchemaM.QuestionTable.Cols.sort_order + " INTEGER NOT NULL, "
        + DbSchemaM.QuestionTable.Cols.title + " TEXT NOT NULL, "
        + DbSchemaM.QuestionTable.Cols.question + " TEXT NOT NULL DEFAULT '', "
        + DbSchemaM.QuestionTable.Cols.archived + " INTEGER DEFAULT " + str(SQLITE_FALSE)
        + ")"
    )

    i_db_conn.execute(
        "INSERT INTO " + DbSchemaM.QuestionTable.name + "("
        + DbSchemaM.QuestionTable.Cols.id + ", "
        + DbSchemaM.QuestionTable.Cols.sort_order + ", "
        + DbSchemaM.QuestionTable.Cols.title + ", "
        + DbSchemaM.QuestionTable.Cols.question
        + ") VALUES (?, ?, ?, ?)",
        (wbd.wbd_global.NO_ACTIVE_QUESTION_INT, -1, "<i>no question</i>", "")
    )

    i_db_conn.execute(
        "CREATE TABLE " + DbSchemaM.DiaryEntryTable.name + "("
        + DbSchemaM.DiaryEntryTable.Cols.id + " INTEGER PRIMARY KEY, "
        + DbSchemaM.DiaryEntryTable.Cols.date_added + " INTEGER, "
        + DbSchemaM.DiaryEntryTable.Cols.favorite + " INTEGER NOT NULL DEFAULT '" + str(SQLITE_FALSE) + "', "
        + DbSchemaM.DiaryEntryTable.Cols.diary_entry + " TEXT, "
        + DbSchemaM.DiaryEntryTable.Cols.question_ref + " INTEGER REFERENCES "
        + DbSchemaM.QuestionTable.name + "(" + DbSchemaM.QuestionTable.Cols.id + ")"
        + " NOT NULL DEFAULT '" + str(wbd.wbd_global.NO_ACTIVE_QUESTION_INT) + "'"
        + ")"
    )
    # + " NOT NULL DEFAULT '" + str(wbd.bwbglobal.NO_ACTIVE_QUESTION_INT) + "'"
    """
    i_db_conn.execute(
        "CREATE INDEX " + DbSchemaM.DiaryEntryTable.name + "("
        + ")"
    )
    """

    i_db_conn.execute(
        "CREATE TABLE " + DbSchemaM.ReminderTable.name + "("
        + DbSchemaM.ReminderTable.Cols.id + " INTEGER PRIMARY KEY, "
        + DbSchemaM.ReminderTable.Cols.title + " TEXT DEFAULT '', "
        + DbSchemaM.ReminderTable.Cols.reminder + " TEXT DEFAULT ''"
        + ")"
    )

    if wbd.wbd_global.testing_bool:
        populate_db_with_test_data()
def map(self, regex):
    """Decorate a function to map it to a regex expression.

    Decorator that maps a Python function to the regex expression provided
    as argument. The regex expression is parsed using the 're' module
    (https://docs.python.org/3/library/re.html). Therefore, all regex syntax
    that this module supports is also supported by this regex matcher. To
    encapsulate arguments for the function in the regex expression, use
    named capture groups (see examples).

    Inspired by the URL mapping of the Flask microframework
    (http://flask.pocoo.org/)

    Args:
        regex: A Python raw string (r'I am a raw string') that can be
            interpreted as a regular expression

    Returns:
        func: The input function without modification

    Example:
        The simplest case just maps a function to a name:

        >>> mapper = FuncMapper()
        >>> @mapper.map(r'a name')
        ... def my_func():
        ...     return 'I, my_func, have been called'
        >>> mapper('a name')
        'I, my_func, have been called'

        But you can also use variables by adding regex capture groups
        (note: only named match groups are supported!):

        >>> mapper = FuncMapper()
        >>> @mapper.map(r'(?P<first>\d+)\+(?P<second>\d+)')
        ... def adder(first, second):
        ...     return '{} + {} = {}'.format(first, second, int(first) + int(second))
        >>> mapper('3+5')
        '3 + 5 = 8'
    """
    def wrapper(f):
        self._mapped_functions[regex] = f
        compiled_regex = re.compile(regex)
        if compiled_regex.groups > 0 and \
                len(compiled_regex.groupindex) < compiled_regex.groups:
            raise NotSupportedError("Only named match groups are supported!")
        self._mapped_regex[regex] = compiled_regex
        return f
    return wrapper