我们从Python开源项目中,提取了以下14个代码示例,用于说明如何使用fileinput.hook_compressed()。
def read_and_clean_files(clueweb_file, ann_file, data_dir, ann_dir):
    """Read a ClueWeb WARC file and its annotation file, replace entity
    mentions, and return the cleaned records.

    :param clueweb_file: name of the WARC file inside ``data_dir``
    :param ann_file: name of the annotation file inside ``ann_dir``
    :param data_dir: WARC files directory
    :param ann_dir: annotations directory
    :return: result of ``WarcEntry.replace_entity_mentions()`` — per the
        original docstring, dicts of the form
        ``{'record_id': ..., 'replaced_record': ..., 'cleaned_record': ...}``
    """
    # hook_compressed lets FileInput transparently read .gz/.bz2 annotation files.
    # NOTE(review): annotation_input is never closed — consider finput.close()
    # (or a with-block) after the read loop.
    annotation_input = fileinput.FileInput(os.path.join(ann_dir, ann_file), openhook=fileinput.hook_compressed)
    annotation_list = []
    for line in annotation_input:
        # One Annotation per line of the annotation file.
        annotation_list.append(Annotation.parse_annotation(line))
    warc_path = os.path.join(data_dir, clueweb_file)
    warc_file = warc.open(warc_path)
    # Python 2 print statements — this snippet predates Python 3.
    print "Replacing entity mentions for ", clueweb_file, ":", ann_file, "..."
    start = time.time()
    warc_entry = WarcEntry(warc_path, warc_file, annotation_list)
    cleaned_records = warc_entry.replace_entity_mentions()
    end = time.time()
    print "Time used: ", end - start
    warc_file.close()
    return cleaned_records
def _open(self, fp): if 'xml' in fp: return fileinput.hook_compressed(fp, 'r') if fp.endswith('.gz'): reader = codecs.getreader("utf-8") return reader(gzip.open(fp)) return codecs.open(fp, encoding='utf-8', mode='r')
def test_gz_ext_fake(self):
    """hook_compressed must route ``*.gz`` names to gzip.open(filename, mode)."""
    saved_gzip_open = gzip.open
    gzip.open = self.fake_open
    try:
        fileinput.hook_compressed("test.gz", 3)
    finally:
        # Always restore the real gzip.open, even if the call raises.
        gzip.open = saved_gzip_open
    self.assertEqual(self.fake_open.invocation_count, 1)
    self.assertEqual(self.fake_open.last_invocation, (("test.gz", 3), {}))
def test_bz2_ext_fake(self):
    """hook_compressed must route ``*.bz2`` names to bz2.BZ2File(filename, mode)."""
    saved_bz2_file = bz2.BZ2File
    bz2.BZ2File = self.fake_open
    try:
        fileinput.hook_compressed("test.bz2", 4)
    finally:
        # Always restore the real BZ2File, even if the call raises.
        bz2.BZ2File = saved_bz2_file
    self.assertEqual(self.fake_open.invocation_count, 1)
    self.assertEqual(self.fake_open.last_invocation, (("test.bz2", 4), {}))
def do_test_use_builtin_open(self, filename, mode):
    """Names without a compression extension must fall through to builtin open()."""
    saved_open = self.replace_builtin_open(self.fake_open)
    try:
        fileinput.hook_compressed(filename, mode)
    finally:
        # Restore the real builtin open no matter what happened above.
        self.replace_builtin_open(saved_open)
    self.assertEqual(self.fake_open.invocation_count, 1)
    self.assertEqual(self.fake_open.last_invocation, ((filename, mode), {}))
def read_input(self): """Read input data from either stdin or file, store raw data, copy""" # threshold for text headers or bad data MAX_ERR_COUNT = 5 error_count = 0 for row in csv.reader(fileinput.FileInput(args, openhook=fileinput.hook_compressed)): if error_count >= MAX_ERR_COUNT: sys.stderr.write("\n# Exiting. Maximum invalid input reached (count={}).".format(error_count)) sys.stderr.write("\n# Check input data.") sys.exit() try: # is this an epoch timestamp? x_val = float( row[self.indep_col] ) except ValueError as e: # is this an Activity Streams (UTC) timestamp? row_time_match = self.time_re.search(row[self.indep_col]) if row_time_match is None: error_count += 1 sys.stderr.write("# could not parse row: {} (errors: {}) \n".format(','.join(row), error_count)) # skip to next row continue row_time = row_time_match.group(0) # parse ts, convert to epoch secs x_ts = datetime.strptime(row_time, self.time_format) x_val = (x_ts - datetime(1970, 1, 1)).total_seconds() y_val = float( row[self.dep_col] ) # build lists of raw data self.raw_x.append(x_val) self.raw_y.append(y_val) # the length of the input array is useful for other steps self.x_array_size = len(self.raw_x)
def process(input_file, cb = None, lim = None):
    """Stream pages out of *input_file* (optionally compressed) and feed
    each one to ``cb(article_id, title, page_text)``.

    :param input_file: path(s) accepted by fileinput.FileInput
    :param cb: callback invoked once per page
    :param lim: truthy page-count limit; iteration stops once exceeded
    """
    stream = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
    for idx, entry in enumerate(pages_from(stream)):
        if lim and lim < idx:
            break
        # revid and ns are produced by pages_from but unused here.
        aid, revid, title, ns, page = entry
        cb(aid, title, "\n".join(page))
    stream.close()
def _get_more_data(self, j):
    """Scan a job's console log and enrich the job dict *j* in place.

    Fills ``status``/``fail``/``branch``/``pipeline``/``length`` (minutes)
    and ``ts`` by pattern-matching console lines. Returns *j*, or None when
    the console is unavailable or a timestamp line fails to parse.
    NOTE(review): ``pipe_re``, ``branch_re``, ``ts_re`` and ``log`` are free
    names — presumably module-level regexes/logger; verify in the module.
    """
    def delta(e, s):
        # Minutes between two parsed timestamps (uses .seconds, so spans
        # under one day; longer runs would be truncated).
        return (self._parse_ts(e) - self._parse_ts(s)).seconds / 60
    start = end = last = None
    # Pessimistic defaults; overwritten below if the log says otherwise.
    j.update({
        'status': 'FAILURE',
        'fail': True,
        'branch': '',
        'length': 0,
    })
    console = self._get_console(j)
    if not console:
        log.error("Failed to get console for job {}".format(repr(j)))
        return None
    else:
        # hook_compressed transparently handles gzipped console logs;
        # lines arrive as bytes, hence the decode below.
        finput = fileinput.FileInput(console, openhook=fileinput.hook_compressed)
        for line in finput:
            line = line.decode()
            # Final-outcome markers; later lines override earlier ones.
            if ('| SUCCESSFULLY FINISHED' in line):
                j['fail'] = False
                j['status'] = 'SUCCESS'
            elif ('| *** FAILED' in line):
                j['fail'] = True
                j['status'] = 'FAILURE'
            elif ("Finished: ABORTED" in line or '[Zuul] Job complete, result: ABORTED' in line):
                j['fail'] = True
                j['status'] = 'ABORTED'
            if ' Pipeline:' in line:
                j['pipeline'] = (pipe_re.search(line).group(1) if pipe_re.search(line) else '')
            if branch_re.search(line):
                j['branch'] = branch_re.search(line).group(1)
            try:
                # Start/end markers; .group(1) raises AttributeError when
                # ts_re does not match the marker line — caught below.
                if ('Started by user' in line or '[Zuul] Launched by' in line or '| PRE-RUN START' in line):
                    start = ts_re.search(line).group(1)
                if ("| Run completed" in line or '[Zuul] Job complete' in line or '| POST-RUN START' in line):
                    end = ts_re.search(line).group(1)
            except Exception as e:
                log.error(e)
                return None
            # Remember the last timestamp seen as a fallback end marker.
            if ts_re.search(line):
                last = ts_re.search(line).group(1)
        end = end or last
        j['length'] = delta(end, start) if start and end else 0
        j['ts'] = self._parse_ts(end) if end else j['ts']
        finput.close()
        return j