我正在尝试使用Python处理一些使用Adobe Acrobat Reader填写并签名的PDF表单。
我试过了:
我可以继续寻找其他库并逐个尝试,但我希望已经有人找到了行之有效的解决方案。
更新: 根据史蒂文的答案,我研究了pdfminer,它很好地解决了这个问题。
from argparse import ArgumentParser
import pickle
import pprint

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdftypes import resolve1, PDFObjRef


def load_form(filename):
    """Load pdf form contents into a nested list of name/value tuples.

    Opens ``filename`` in binary mode, parses it with pdfminer and walks
    every entry of the catalog's ``AcroForm`` ``Fields`` array through
    ``load_fields``.

    Returns a list with one element per top-level field: either a
    ``(name, value)`` tuple or, for fields that have ``Kids``, a nested
    list of the same shape.

    Raises ``KeyError`` if the document catalog has no ``AcroForm`` entry
    or the form has no ``Fields`` entry.
    """
    # The local is deliberately not called ``file`` so it does not shadow
    # the Python 2 builtin of that name.
    with open(filename, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        # Resolve everything while the file is still open.
        acro_form = resolve1(doc.catalog['AcroForm'])
        return [load_fields(resolve1(f)) for f in acro_form['Fields']]


def load_fields(field):
    """Recursively load form fields.

    ``field`` is a resolved field dictionary. If it has a non-empty
    ``Kids`` entry, each kid is resolved and loaded recursively and a list
    is returned. Otherwise a ``(name, value)`` tuple is returned, where the
    name is the ``T`` entry decoded as UTF-16 and the value is the resolved
    ``V`` entry.

    Raises ``AttributeError`` if a leaf field has no ``T`` entry, because
    ``field.get('T')`` is then ``None``.
    """
    kids = field.get('Kids', None)
    if kids:
        return [load_fields(resolve1(kid)) for kid in kids]
    # Some field types, like signatures, need extra resolving
    return (field.get('T').decode('utf-16'), resolve1(field.get('V')))


def parse_cli():
    """Load command line arguments.

    Returns the parsed namespace with ``file`` (input PDF path), ``out``
    (output path or ``None``) and ``pickle`` (bool).
    """
    parser = ArgumentParser(description='Dump the form contents of a PDF.')
    parser.add_argument('file', metavar='pdf_form',
                        help='PDF Form to dump the contents of')
    parser.add_argument('-o', '--out', help='Write output to file',
                        default=None, metavar='FILE')
    parser.add_argument('-p', '--pickle', action='store_true', default=False,
                        help='Format output for python consumption')
    return parser.parse_args()


def main():
    """Dump the form of the PDF named on the command line.

    Output goes to the ``--out`` file when given, otherwise to stdout. With
    ``--pickle`` the form is pickled, otherwise it is pretty-printed.
    """
    args = parse_cli()
    form = load_form(args.file)
    if args.out:
        # Pickle data is a byte stream, so use binary mode when pickling;
        # the pretty-printed form is text.
        mode = 'wb' if args.pickle else 'w'
        with open(args.out, mode) as outfile:
            if args.pickle:
                pickle.dump(form, outfile)
            else:
                pp = pprint.PrettyPrinter(indent=2)
                # Fix: the original called ``file.write`` (the builtin
                # type, not the opened handle), so nothing was written.
                outfile.write(pp.pformat(form))
    else:
        if args.pickle:
            # Single-argument parenthesised form behaves the same whether
            # ``print`` is a statement or a function.
            print(pickle.dumps(form))
        else:
            pp = pprint.PrettyPrinter(indent=2)
            pp.pprint(form)


if __name__ == '__main__':
    main()
您应该能够使用pdfminer做到这一点,但这需要深入研究pdfminer的内部结构,并具备PDF格式方面的知识(当然主要是与表单相关的部分,但也需要了解PDF的内部结构,例如“字典”和“间接对象”)。
该示例可能会为您提供帮助(我认为它仅适用于简单情况,没有嵌套字段等)。
import sys

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1

# Print every top-level AcroForm field of the PDF given as the first
# command-line argument, one "name: value" line per field. Only the entries
# listed directly in the form's 'Fields' array are visited; nested fields
# ('Kids') are not descended into.
filename = sys.argv[1]

# The original never closed the file. All parsing and resolving stays inside
# the ``with`` block so the handle is open for as long as the parser is in
# use, and is closed once the fields have been printed.
with open(filename, 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    fields = resolve1(doc.catalog['AcroForm'])['Fields']
    for ref in fields:
        field = resolve1(ref)
        # 'T' holds the field name and 'V' its value; either is None when
        # the entry is absent.
        name, value = field.get('T'), field.get('V')
        # Single-argument parenthesised form behaves the same whether
        # ``print`` is a statement or a function.
        print('{0}: {1}'.format(name, value))
编辑:忘记提及:如果您需要提供密码,请将其传递给 doc.initialize()
# If the PDF requires a password, pass it as the argument to this call.
doc.initialize()