我需要读取一些大文件(5万到10万行),文件内容以空行分隔成若干组。每组都以相同的格式“No.999999999 dd/mm/yyyy ZZZ”开头。下面是一些示例数据。
No.813829461 09/16/1987 270 SUZANO PAPEL E CELULOSE SA(BR / BA) CNPJ / CIC / N INPI:16404287000155 律师:MARCELLO DO NASCIMENTO No.815326777 12/28/1989 351 Tit.SIGLA SISTEMA GLOBO DE GRAVACOES AUDIO VISUAIS LTDA(BR / RJ) CNPJ / CIC / INPINº:34162651000108代表 :主格; 类别:产品 品牌:TRIO TROPICAL产品/ 服务:09.40 *根据2006年6月1日第123号决议的规定,2006年1月24日在RPI 1829中发布。 律师:WALDEMAR RODRIGUES PEDRA No.900148764 2007年11月1日LD3 Tit.TIARA BOLSAS ECALÇADOSLTDA 律师:Marcia Ferreira Gomes 办公室:Marcas Marcantes e CN Ltda 根据157条的规定,没有形式要求令人满意,商标注册请求不存在的LPI *符合正式要求的协议:810080140197
No.813829461 09/16/1987 270 SUZANO PAPEL E CELULOSE SA(BR / BA) CNPJ / CIC / N INPI:16404287000155 律师:MARCELLO DO NASCIMENTO
No.815326777 12/28/1989 351 Tit.SIGLA SISTEMA GLOBO DE GRAVACOES AUDIO VISUAIS LTDA(BR / RJ) CNPJ / CIC / INPINº:34162651000108代表 :主格; 类别:产品 品牌:TRIO TROPICAL产品/ 服务:09.40 *根据2006年6月1日第123号决议的规定,2006年1月24日在RPI 1829中发布。 律师:WALDEMAR RODRIGUES PEDRA
No.900148764 2007年11月1日LD3 Tit.TIARA BOLSAS ECALÇADOSLTDA 律师:Marcia Ferreira Gomes 办公室:Marcas Marcantes e CN Ltda 根据157条的规定,没有形式要求令人满意,商标注册请求不存在的LPI *符合正式要求的协议:810080140197
我写了一些相应地解析它的代码。有什么我需要改进的地方,以提高可读性或性能?这是我到目前为止的内容:
import pprint
import re


class Despacho(object):
    """Parse the lines of one record and store the captured fields.

    Every line passed to ``read`` is tried against each pattern in
    ``regexp``; when a pattern matches, the associated handler stores the
    captured groups as attributes on the instance.
    """

    # Maps each compiled pattern to a function that, given an instance,
    # returns the bound handler to call with the match object.
    regexp = {
        re.compile(r'No.([\d]{9}) ([\d]{2}/[\d]{2}/[\d]{4}) (.*)'):
            lambda self: self._processo,
        re.compile(r'Tit.(.*)'):
            lambda self: self._titular,
        re.compile(r'Procurador: (.*)'):
            lambda self: self._procurador,
        re.compile(r'C.N.P.J./C.I.C./N INPI :(.*)'):
            lambda self: self._documento,
        re.compile(r'Apres.: (.*) ; Nat.: (.*)'):
            lambda self: self._apresentacao,
        re.compile(r'Marca: (.*)'):
            lambda self: self._marca,
        re.compile(r'Clas.Prod/Serv: (.*)'):
            lambda self: self._classe,
        re.compile(r'\*(.*)'):
            lambda self: self._complemento,
    }

    def __init__(self):
        """Create an empty record.

        'complemento' is the only field that can occur several times in a
        single record, so it is collected in a list.
        """
        self.complemento = []

    def _processo(self, matches):
        self.processo, self.data, self.despacho = matches.groups()

    def _titular(self, matches):
        self.titular = matches.group(1)

    def _procurador(self, matches):
        self.procurador = matches.group(1)

    def _documento(self, matches):
        self.documento = matches.group(1)

    def _apresentacao(self, matches):
        self.apresentacao, self.natureza = matches.groups()

    def _marca(self, matches):
        self.marca = matches.group(1)

    def _classe(self, matches):
        self.classe = matches.group(1)

    def _complemento(self, matches):
        self.complemento.append(matches.group(1))

    def read(self, line):
        """Apply every known pattern to *line* and store whatever matches."""
        for pattern, get_handler in Despacho.regexp.items():
            m = pattern.match(line)
            if m:
                get_handler(self)(m)


def process(rpi):
    """Yield one ``Despacho`` per group of lines found in *rpi*.

    *rpi* is any iterable of text lines (an open file works).  A group
    starts at a line beginning with ``'No.'`` and ends at the next blank
    line, at the next ``'No.'`` line, or at the end of the input.  Lines
    that are not inside a group are ignored.
    """
    current = None
    for line in rpi:
        if line.startswith('No.'):
            # A new header with no blank separator still closes the
            # previous group instead of silently discarding it.
            if current is not None:
                yield current
            current = Despacho()
        elif not line.strip():
            # Blank line: end of the current group, if there is one.
            if current is not None:
                yield current
                current = None
            continue
        if current is not None:
            current.read(line)
    # The input may end without a trailing blank line.
    if current is not None:
        yield current


def main():
    """Parse 'rm1972.txt' and pretty-print every record found in it."""
    with open('rm1972.txt') as arquivo:  # file to process
        for desp in process(arquivo):
            pprint.pprint(desp.__dict__)
            print('--------------')


if __name__ == '__main__':
    main()
很好。下面是一些建议,如果您喜欢,请告诉我:
import re
import pprint
import sys


class Despacho(object):
    """Parse the lines of one record and store the captured fields.

    Every line passed to ``read`` is tried against each pattern in
    ``regexp``; the captured groups are stored on the instance under the
    attribute names that form the pattern's key.
    """

    # Maps a tuple of attribute names to the pattern whose groups fill
    # them, in order.
    regexp = {
        ('processo', 'data', 'despacho'):
            re.compile(r'No.([\d]{9}) ([\d]{2}/[\d]{2}/[\d]{4}) (.*)'),
        ('titular',): re.compile(r'Tit.(.*)'),
        ('procurador',): re.compile(r'Procurador: (.*)'),
        ('documento',): re.compile(r'C.N.P.J./C.I.C./N INPI :(.*)'),
        ('apresentacao', 'natureza'):
            re.compile(r'Apres.: (.*) ; Nat.: (.*)'),
        ('marca',): re.compile(r'Marca: (.*)'),
        ('classe',): re.compile(r'Clas.Prod/Serv: (.*)'),
        ('complemento',): re.compile(r'\*(.*)'),
    }

    def __init__(self):
        """Create an empty record.

        'complemento' is the only field that can occur several times in a
        single record, so it is collected in a list.
        """
        self.complemento = []

    def read(self, line):
        """Apply every known pattern to *line* and store whatever matches."""
        # items() instead of iteritems() so this runs on Python 2 and 3.
        for attrs, pattern in Despacho.regexp.items():
            m = pattern.match(line)
            if not m:
                continue
            for attr, value in zip(attrs, m.groups()):
                if attr == 'complemento':
                    # Repeatable field: accumulate instead of overwrite.
                    self.complemento.append(value)
                else:
                    setattr(self, attr, value)

    def __repr__(self):
        """Return a pretty-printed dict of every known field.

        Fields that were never set are shown as ``None``.
        """
        fields = {}
        for attrs in self.regexp:
            for attr in attrs:
                fields[attr] = getattr(self, attr, None)
        return pprint.pformat(fields)


def process(rpi):
    """Yield one ``Despacho`` per group of lines found in *rpi*.

    *rpi* is any iterable of text lines (an open file works).  A group
    starts at a line beginning with ``'No.'`` and ends at the next blank
    line, at the next ``'No.'`` line, or at the end of the input.  Lines
    that are not inside a group are ignored.
    """
    current = None
    for line in rpi:
        if line.startswith('No.'):
            # A new header with no blank separator still closes the
            # previous group instead of silently discarding it.
            if current is not None:
                yield current
            current = Despacho()
        elif not line.strip():
            # Blank line: end of the current group, if there is one.
            if current is not None:
                yield current
                current = None
            continue
        if current is not None:
            current.read(line)
    # The input may end without a trailing blank line.
    if current is not None:
        yield current


def main():
    """Parse 'rm1972.txt', print every record, and return exit status 0."""
    with open('rm1972.txt') as arquivo:  # file to process
        for desp in process(arquivo):
            print(desp)  # __repr__ provides the formatted output
            print('-' * 20)
    return 0


if __name__ == '__main__':
    sys.exit(main())