Python fileinput 模块:hook_compressed() 实例源码

我们从 Python 开源项目中提取了以下 14 个代码示例,用于说明如何使用 fileinput.hook_compressed()。

项目:tino-thesis    作者:Tino92    | 项目源码 | 文件源码
def read_and_clean_files(clueweb_file, ann_file, data_dir, ann_dir):
    """
    Parse one annotation file and apply its annotations to one WARC file.

    Every line of ``ann_dir/ann_file`` is parsed with
    ``Annotation.parse_annotation``; the file is opened through
    ``fileinput.hook_compressed`` so a ``.gz``/``.bz2`` name is decompressed
    on the fly. The WARC file ``data_dir/clueweb_file`` is then opened and a
    ``WarcEntry`` built from it and the parsed annotations performs the
    entity-mention replacement. Progress and elapsed time are printed.

    :param clueweb_file: name of the WARC file inside ``data_dir``
    :param ann_file: name of the annotation file inside ``ann_dir``
    :param data_dir: Warc files directory
    :param ann_dir: Annotations directory
    :return: the value returned by ``WarcEntry.replace_entity_mentions()``.
        NOTE(review): earlier documentation described it as
        {'record_id': record_id,
        'replaced_record': cleaned_replaced_record,
        'cleaned_record': cleaned_record} -- confirm against WarcEntry.
    """
    annotation_input = fileinput.FileInput(os.path.join(ann_dir, ann_file),
                                           openhook=fileinput.hook_compressed)
    try:
        annotation_list = [Annotation.parse_annotation(line)
                           for line in annotation_input]
    finally:
        # Release the (possibly compressed) annotation file even if a line
        # fails to parse.
        annotation_input.close()

    warc_path = os.path.join(data_dir, clueweb_file)
    warc_file = warc.open(warc_path)
    try:
        # Single-argument print() emits the same text under Python 2 and 3.
        print("Replacing entity mentions for  %s : %s ..."
              % (clueweb_file, ann_file))
        start = time.time()
        warc_entry = WarcEntry(warc_path, warc_file, annotation_list)
        cleaned_records = warc_entry.replace_entity_mentions()
        end = time.time()
        print("Time used:  %s" % (end - start,))
    finally:
        # Close the WARC handle whether or not the replacement succeeded.
        warc_file.close()
    return cleaned_records
项目:gennotes    作者:madprime    | 项目源码 | 文件源码
def _open(self, fp):
        """Open the file at path ``fp`` for reading and return the handle.

        * a path containing ``'xml'`` anywhere is opened via
          ``fileinput.hook_compressed`` in mode ``'r'``;
        * otherwise a path ending in ``.gz`` is opened with ``gzip.open`` and
          wrapped in a UTF-8 stream reader;
        * anything else is opened with ``codecs.open`` as UTF-8 text.
        """
        if 'xml' in fp:
            handle = fileinput.hook_compressed(fp, 'r')
        elif fp.endswith('.gz'):
            utf8_reader = codecs.getreader("utf-8")
            handle = utf8_reader(gzip.open(fp))
        else:
            handle = codecs.open(fp, encoding='utf-8', mode='r')
        return handle
项目:web_ctp    作者:molebot    | 项目源码 | 文件源码
def test_gz_ext_fake(self):
        """A ``.gz`` filename must be opened through ``gzip.open`` exactly once."""
        saved_gzip_open = gzip.open
        # Swap in the recording callable only for the duration of the call.
        gzip.open = self.fake_open
        try:
            fileinput.hook_compressed("test.gz", 3)
        finally:
            gzip.open = saved_gzip_open

        # Positional args and (empty) keyword args the opener should have seen.
        expected_call = (("test.gz", 3), {})
        self.assertEqual(self.fake_open.invocation_count, 1)
        self.assertEqual(self.fake_open.last_invocation, expected_call)
项目:web_ctp    作者:molebot    | 项目源码 | 文件源码
def test_bz2_ext_fake(self):
        """A ``.bz2`` filename must be opened through ``bz2.BZ2File`` exactly once."""
        saved_bz2_file = bz2.BZ2File
        # Swap in the recording callable only for the duration of the call.
        bz2.BZ2File = self.fake_open
        try:
            fileinput.hook_compressed("test.bz2", 4)
        finally:
            bz2.BZ2File = saved_bz2_file

        # Positional args and (empty) keyword args the opener should have seen.
        expected_call = (("test.bz2", 4), {})
        self.assertEqual(self.fake_open.invocation_count, 1)
        self.assertEqual(self.fake_open.last_invocation, expected_call)
项目:web_ctp    作者:molebot    | 项目源码 | 文件源码
def do_test_use_builtin_open(self, filename, mode):
        """Assert that ``hook_compressed(filename, mode)`` calls the stand-in opener once.

        ``self.replace_builtin_open`` is given ``self.fake_open`` before the
        call; whatever it returns is handed back to it afterwards, even when
        the call raises.
        """
        previous_open = self.replace_builtin_open(self.fake_open)
        try:
            fileinput.hook_compressed(filename, mode)
        finally:
            self.replace_builtin_open(previous_open)

        recorder = self.fake_open
        expected_call = ((filename, mode), {})
        self.assertEqual(recorder.invocation_count, 1)
        self.assertEqual(recorder.last_invocation, expected_call)
项目:social-media-pulse    作者:jrmontag    | 项目源码 | 文件源码
def read_input(self):
        """Read CSV rows from the inputs named by ``args`` and store the raw data.

        Rows come from ``fileinput.FileInput(args, ...)`` with
        ``fileinput.hook_compressed`` as the open hook. For each row the
        column ``self.indep_col`` is taken as the x value: first as a plain
        float, otherwise as a timestamp located with ``self.time_re`` and
        parsed with ``self.time_format`` into seconds since 1970-01-01. The
        column ``self.dep_col`` is converted to float as the y value. Values
        are appended to ``self.raw_x`` / ``self.raw_y`` and
        ``self.x_array_size`` is set to the number of x values read.

        Rows whose x column cannot be parsed are reported on stderr and
        skipped. Once ``MAX_ERR_COUNT`` such rows have been seen, the next
        row triggers ``sys.exit()``.

        Raises:
            ValueError: if the y column of a row is not a valid float.
            SystemExit: when too many rows could not be parsed.
        """
        # threshold for text headers or bad data
        MAX_ERR_COUNT = 5
        error_count = 0
        source = fileinput.FileInput(args, openhook=fileinput.hook_compressed)
        try:
            for row in csv.reader(source):
                if error_count >= MAX_ERR_COUNT:
                    sys.stderr.write("\n# Exiting. Maximum invalid input reached (count={}).".format(error_count))
                    sys.stderr.write("\n# Check input data.")
                    # NOTE(review): exits with status 0 although this is an
                    # error path -- confirm whether callers rely on that.
                    sys.exit()
                try:
                    # is this an epoch timestamp?
                    x_val = float(row[self.indep_col])
                except ValueError:
                    # is this an Activity Streams (UTC) timestamp?
                    row_time_match = self.time_re.search(row[self.indep_col])
                    if row_time_match is None:
                        error_count += 1
                        sys.stderr.write("# could not parse row: {} (errors: {}) \n".format(','.join(row), error_count))
                        # skip to next row
                        continue
                    row_time = row_time_match.group(0)
                    # parse ts, convert to epoch secs
                    x_ts = datetime.strptime(row_time, self.time_format)
                    x_val = (x_ts - datetime(1970, 1, 1)).total_seconds()
                y_val = float(row[self.dep_col])
                # build lists of raw data
                self.raw_x.append(x_val)
                self.raw_y.append(y_val)
        finally:
            # Always release the input, including on sys.exit() and on
            # conversion errors.
            source.close()
        # the length of the input array is useful for other steps
        self.x_array_size = len(self.raw_x)
项目:ouroboros    作者:pybee    | 项目源码 | 文件源码
def test_gz_ext_fake(self):
        """A ``.gz`` filename must be opened through ``gzip.open`` exactly once."""
        saved_gzip_open = gzip.open
        # Swap in the recording callable only for the duration of the call.
        gzip.open = self.fake_open
        try:
            fileinput.hook_compressed("test.gz", 3)
        finally:
            gzip.open = saved_gzip_open

        # Positional args and (empty) keyword args the opener should have seen.
        expected_call = (("test.gz", 3), {})
        self.assertEqual(self.fake_open.invocation_count, 1)
        self.assertEqual(self.fake_open.last_invocation, expected_call)
项目:ouroboros    作者:pybee    | 项目源码 | 文件源码
def test_bz2_ext_fake(self):
        """A ``.bz2`` filename must be opened through ``bz2.BZ2File`` exactly once."""
        saved_bz2_file = bz2.BZ2File
        # Swap in the recording callable only for the duration of the call.
        bz2.BZ2File = self.fake_open
        try:
            fileinput.hook_compressed("test.bz2", 4)
        finally:
            bz2.BZ2File = saved_bz2_file

        # Positional args and (empty) keyword args the opener should have seen.
        expected_call = (("test.bz2", 4), {})
        self.assertEqual(self.fake_open.invocation_count, 1)
        self.assertEqual(self.fake_open.last_invocation, expected_call)
项目:ouroboros    作者:pybee    | 项目源码 | 文件源码
def do_test_use_builtin_open(self, filename, mode):
        """Assert that ``hook_compressed(filename, mode)`` calls the stand-in opener once.

        ``self.replace_builtin_open`` is given ``self.fake_open`` before the
        call; whatever it returns is handed back to it afterwards, even when
        the call raises.
        """
        previous_open = self.replace_builtin_open(self.fake_open)
        try:
            fileinput.hook_compressed(filename, mode)
        finally:
            self.replace_builtin_open(previous_open)

        recorder = self.fake_open
        expected_call = ((filename, mode), {})
        self.assertEqual(recorder.invocation_count, 1)
        self.assertEqual(recorder.last_invocation, expected_call)
项目:kbe_server    作者:xiaohaoppy    | 项目源码 | 文件源码
def test_gz_ext_fake(self):
        """A ``.gz`` filename must be opened through ``gzip.open`` exactly once."""
        saved_gzip_open = gzip.open
        # Swap in the recording callable only for the duration of the call.
        gzip.open = self.fake_open
        try:
            fileinput.hook_compressed("test.gz", 3)
        finally:
            gzip.open = saved_gzip_open

        # Positional args and (empty) keyword args the opener should have seen.
        expected_call = (("test.gz", 3), {})
        self.assertEqual(self.fake_open.invocation_count, 1)
        self.assertEqual(self.fake_open.last_invocation, expected_call)
项目:kbe_server    作者:xiaohaoppy    | 项目源码 | 文件源码
def test_bz2_ext_fake(self):
        """A ``.bz2`` filename must be opened through ``bz2.BZ2File`` exactly once."""
        saved_bz2_file = bz2.BZ2File
        # Swap in the recording callable only for the duration of the call.
        bz2.BZ2File = self.fake_open
        try:
            fileinput.hook_compressed("test.bz2", 4)
        finally:
            bz2.BZ2File = saved_bz2_file

        # Positional args and (empty) keyword args the opener should have seen.
        expected_call = (("test.bz2", 4), {})
        self.assertEqual(self.fake_open.invocation_count, 1)
        self.assertEqual(self.fake_open.last_invocation, expected_call)
项目:kbe_server    作者:xiaohaoppy    | 项目源码 | 文件源码
def do_test_use_builtin_open(self, filename, mode):
        """Assert that ``hook_compressed(filename, mode)`` calls the stand-in opener once.

        ``self.replace_builtin_open`` is given ``self.fake_open`` before the
        call; whatever it returns is handed back to it afterwards, even when
        the call raises.
        """
        previous_open = self.replace_builtin_open(self.fake_open)
        try:
            fileinput.hook_compressed(filename, mode)
        finally:
            self.replace_builtin_open(previous_open)

        recorder = self.fake_open
        expected_call = ((filename, mode), {})
        self.assertEqual(recorder.invocation_count, 1)
        self.assertEqual(recorder.last_invocation, expected_call)
项目:hyperbolic-caching    作者:kantai    | 项目源码 | 文件源码
def process(input_file, cb=None, lim=None):
    """Feed every page found in ``input_file`` to the callback ``cb``.

    The input is opened with ``fileinput.FileInput`` using
    ``fileinput.hook_compressed`` and handed to ``pages_from``, which is
    expected to yield 5-item records ``(aid, revid, title, ns, page)``.
    The items of ``page`` are joined with newlines before the callback
    is invoked.

    :param input_file: file name(s) accepted by ``fileinput.FileInput``
    :param cb: callable invoked as ``cb(aid, title, page_content)`` for each
        page; when ``None`` the pages are still iterated but nothing is
        called
    :param lim: optional cap; iteration stops at the first page whose
        zero-based index exceeds ``lim`` (so indices ``0..lim`` are handled).
        A falsy value (``None`` or ``0``) means no cap.
    """
    pages_input = fileinput.FileInput(input_file,
                                      openhook=fileinput.hook_compressed)
    try:
        for ix, page_data in enumerate(pages_from(pages_input)):
            if lim and lim < ix:
                break
            aid, revid, title, ns, page = page_data
            if cb is not None:
                cb(aid, title, "\n".join(page))
    finally:
        # Close the input even when pages_from() or the callback raises.
        pages_input.close()
项目:sova    作者:sshnaidm    | 项目源码 | 文件源码
def _get_more_data(self, j):
        """Update job dict ``j`` with details scraped from the job's console log.

        ``j`` is first reset to a failed state (``status='FAILURE'``,
        ``fail=True``, ``branch=''``, ``length=0``). The console obtained from
        ``self._get_console(j)`` is then read line by line (through
        ``fileinput.hook_compressed``) and marker strings in the log update
        ``status``/``fail``, ``pipeline`` and ``branch``. Start and end
        timestamps are captured with ``ts_re``; when no end marker is seen,
        the last timestamp found in the log is used instead.

        :param j: job dict, modified in place
        :return: ``j`` on success; ``None`` when the console is unavailable
            or when a start/end marker line yields no usable timestamp (the
            error is logged)
        """
        def delta(e, s):
            # NOTE(review): if _parse_ts returns datetimes, ``.seconds``
            # drops the days part of the difference -- confirm that runs
            # longer than 24h cannot occur.
            return (self._parse_ts(e) - self._parse_ts(s)).seconds / 60

        start = end = last = None
        j.update({
            'status': 'FAILURE',
            'fail': True,
            'branch': '',
            'length': 0,
        })
        console = self._get_console(j)
        if not console:
            log.error("Failed to get console for job {}".format(repr(j)))
            return None

        finput = fileinput.FileInput(console,
                                     openhook=fileinput.hook_compressed)
        try:
            for line in finput:
                # The hook yields bytes or text depending on file type and
                # Python version; only bytes need decoding.
                if isinstance(line, bytes):
                    line = line.decode()
                if ('|  SUCCESSFULLY FINISHED' in line):
                    j['fail'] = False
                    j['status'] = 'SUCCESS'
                elif ('|  *** FAILED' in line):
                    j['fail'] = True
                    j['status'] = 'FAILURE'
                elif ("Finished: ABORTED" in line or
                        '[Zuul] Job complete, result: ABORTED' in line):
                    j['fail'] = True
                    j['status'] = 'ABORTED'
                if '  Pipeline:' in line:
                    pipe_match = pipe_re.search(line)
                    j['pipeline'] = pipe_match.group(1) if pipe_match else ''
                branch_match = branch_re.search(line)
                if branch_match:
                    j['branch'] = branch_match.group(1)
                # One timestamp search per line serves start, end and last.
                ts_match = ts_re.search(line)
                try:
                    if ('Started by user' in line or
                            '[Zuul] Launched by' in line or
                            '| PRE-RUN START' in line):
                        start = ts_match.group(1)
                    if ("|  Run completed" in line or
                            '[Zuul] Job complete' in line or
                            '| POST-RUN START' in line):
                        end = ts_match.group(1)
                except Exception as e:
                    # A marker line without a timestamp aborts the parsing.
                    log.error(e)
                    return None
                if ts_match:
                    last = ts_match.group(1)
        finally:
            # Runs on the early ``return None`` above as well, so the
            # console handle is never leaked.
            finput.close()
        end = end or last
        j['length'] = delta(end, start) if start and end else 0
        j['ts'] = self._parse_ts(end) if end else j['ts']
        return j