Python nltk.tokenize module: RegexpTokenizer() example source code

We extracted the following 49 code examples from open-source Python projects to illustrate how to use nltk.tokenize.RegexpTokenizer().
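
Before the project snippets, here is a minimal standalone sketch of the two ways RegexpTokenizer is typically used below: matching tokens against a pattern, and splitting on a separator pattern with gaps=True. The sample sentence and variable names are illustrative only and are not taken from any of the listed projects.

from nltk.tokenize import RegexpTokenizer

text = "Mr. O'Neill spent $23.00 on 2 tickets.\nGreat value!"

# Match mode: keep runs of word characters as tokens, dropping punctuation.
word_tokenizer = RegexpTokenizer(r'\w+')
print(word_tokenizer.tokenize(text.lower()))
# ['mr', 'o', 'neill', 'spent', '23', '00', 'on', '2', 'tickets', 'great', 'value']

# Gap mode: the pattern describes the separators rather than the tokens,
# so this splits the text into lines on newline characters.
line_tokenizer = RegexpTokenizer('\n', gaps=True)
print(line_tokenizer.tokenize(text))
# ["Mr. O'Neill spent $23.00 on 2 tickets.", 'Great value!']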

Project: Price-Comparator    Author: Thejas-1    | Project source | File source
def __init__(self, root, fileids,
                 sep='/', word_tokenizer=WhitespaceTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 alignedsent_block_reader=read_alignedsent_block,
                 encoding='latin1'):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def __init__(self, root, fileids,
                 sep='/', word_tokenizer=WhitespaceTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 alignedsent_block_reader=read_alignedsent_block,
                 encoding='latin1'):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
Project: neighborhood_mood_aws    Author: jarrellmark    | Project source | File source
def __init__(self, root, fileids,
                 sep='/', word_tokenizer=WhitespaceTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 alignedsent_block_reader=read_alignedsent_block,
                 encoding='latin1'):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
Project: Artificial-Intelligence-with-Python    Author: PacktPublishing    | Project source | File source
def process(input_text):
    # Create a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Create a Snowball stemmer 
    stemmer = SnowballStemmer('english')

    # Get the list of stop words 
    stop_words = stopwords.words('english')

    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Remove the stop words 
    tokens = [x for x in tokens if not x in stop_words]

    # Perform stemming on the tokenized words 
    tokens_stemmed = [stemmer.stem(x) for x in tokens]

    return tokens_stemmed
Project: sentiment-analysis    Author: saber1988    | Project source | File source
def paragraph_to_words(paragraph, remove_stopwords=False, lemmatize=True, stem=False):
    words = BeautifulSoup(paragraph["review"], "html.parser").get_text()
    words = re.sub("[^a-zA-Z]", " ", words)
    # tokenizer = RegexpTokenizer(r'\w+')
    # words = tokenizer.tokenize(words.strip().lower())
    words = words.lower().split()

    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]

    if lemmatize:
        words = [lemmatizer.lemmatize(w) for w in words]

    if stem:
        words = [stemmer.stem(w) for w in words]

    return LabelDoc(words, paragraph["id"])
Project: hate-to-hugs    Author: sdoran35    | Project source | File source
def __init__(self, root, fileids,
                 sep='/', word_tokenizer=WhitespaceTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 alignedsent_block_reader=read_alignedsent_block,
                 encoding='latin1'):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
Project: FancyWord    Author: EastonLee    | Project source | File source
def __init__(self, root, fileids,
                 sep='/', word_tokenizer=WhitespaceTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 alignedsent_block_reader=read_alignedsent_block,
                 encoding='latin1'):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
Project: beepboop    Author: nicolehe    | Project source | File source
def __init__(self, root, fileids,
                 sep='/', word_tokenizer=WhitespaceTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 alignedsent_block_reader=read_alignedsent_block,
                 encoding='latin1'):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
Project: kind2anki    Author: prz3m    | Project source | File source
def __init__(self, root, fileids,
                 sep='/', word_tokenizer=WhitespaceTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 alignedsent_block_reader=read_alignedsent_block,
                 encoding='latin1'):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
Project: but_sentiment    Author: MixedEmotions    | Project source | File source
def __init__(self, root, fileids,
                 sep='/', word_tokenizer=WhitespaceTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 alignedsent_block_reader=read_alignedsent_block,
                 encoding='latin1'):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
Project: TensorGraph    Author: hycis    | Project source | File source
def tweets(word_len, sent_len, train_valid_ratio=[5,1]):
    df = pandas.read_csv('tweets_large.csv')
    field = 'text'
    label = 'label'
    tokenizer = RegexpTokenizer(r'\w+')

    # encode characters into numbers
    encoder = CharNumberEncoder(df[field].values, tokenizer=tokenizer,
                                word_len=word_len, sent_len=sent_len)
    encoder.build_char_map()
    encode_X = encoder.make_char_embed()

    # encode categories into one hot array
    cat_encoder = CatNumberEncoder(df[label])
    cat_encoder.build_cat_map()

    encode_y = cat_encoder.make_cat_embed()
    nclass = len(np.unique(encode_y))
    encode_y = make_one_hot(encode_y, nclass)

    return encode_X, encode_y, nclass
Project: AirbnbReviewAnalyzer    Author: mrsata    | Project source | File source
def analysis(reviews_collection_text):
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        raw_data = f.read()
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        comments = f.readlines()
    data = raw_data.replace('\n', ' ')
    data_lower = data.lower()
    tokens_with_punc = word_tokenize(data_lower)
    tokens = RegexpTokenizer(r'\w+').tokenize(data_lower)
    print("--- Most frequent tokens ---\n",
        FreqDist(tokens_with_punc).most_common(15))
    print("--- Tokens without punctuation ---\n",
        FreqDist(tokens).most_common(15))
    stop = set(stopwords.words('english'))
    words = [word for word in tokens if word not in stop]
    print("--- Most frequent words ---\n", FreqDist(words).most_common(15))
    tagged = pos_tag(words)
    nouns = [word for word, pos in tagged if (pos == 'NN')]
    print("--- Most frequent nouns ---\n", FreqDist(nouns).most_common(15))
    adjts = [word for word, pos in tagged if (pos == 'JJ')]
    print("--- Most frequent adjective ---\n", FreqDist(adjts).most_common(15))
    tokns = [RegexpTokenizer(r'\w+').tokenize(comment) for comment in comments]
    lxdst = [lexical_density(token) for token in tokns if len(token) > 0]
    avgld = sum(lxdst) / len(comments)
    print("--- Average lexical density ---\n", avgld)
Project: search_relevance    Author: rmanak    | Project source | File source
def __init__(self, fname):
        words_map = {}
        for line in csv.reader(open(fname)):
            word, syn = line
            if word.startswith('#'):
                continue
            words_map[word] = syn
        super(CSVWordReplacer, self).__init__(words_map)


######### for now just a wrapper to RegexpTokenizer #########
Project: search_relevance    Author: rmanak    | Project source | File source
def __init__(self,pattern):
        self.pattern = pattern 
        self.tokenizer = RegexpTokenizer(self.pattern)



######## defining a default stopwords set #############
Project: kaggle-review    Author: daxiongshu    | Project source | File source
def rm_punctuation(data,pattern=r'[a-zA-Z]+-?[0-9]*',silent=1):
    if silent==0:
        print("remove punctuation ...")
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(pattern)
    return tokenizer.tokenize(" ".join(data))
Project: Python-Machine-Learning-Cookbook    Author: PacktPublishing    | Project source | File source
def __init__(self):
        # Create a regular expression tokenizer
        self.tokenizer = RegexpTokenizer(r'\w+')

        # get the list of stop words 
        self.stop_words_english = stopwords.words('english')

        # Create a Snowball stemmer 
        self.stemmer = SnowballStemmer('english')

    # Tokenizing, stop word removal, and stemming
Project: AcronymExpansion    Author: adityathakker    | Project source | File source
def get_list():
    stop_words = set(stopwords.words('english'))

    filename = 'data/new_acronyms.json'
    f = open(filename, 'r')
    data = json.load(f)
    paragraph_list = []
    full_form_list = []
    for k,v in data.items():
        if k=="WDM":
            for poss in v['possibilities']:
                paragraph_list.append(poss['summary'])
                full_form_list.append(poss['full_form'])
    s="two devices can also function as an add/drop multiplexer (ADM), i.e. simultaneously adding light beams while dropping other light beams and rerouting them to other destinations and devices. Formerly, such filtering of light beams was done with etalons, devices called Fabry–Pérot interferometers using thin-film-coated optical glass. The first WDM technology was conceptualized in the early 1970s and realized in the laboratory in the late 1970s; but these only combined two signals, and many years later were still very expensive.As of 2011, WDM systems can handle 160 signals, which will expand a 10 Gbit/second system with a single fiber optic pair of conductors to more than 1.6 Tbit/second (i.e. 1,600 Gbit/s).Typical WDM systems use single-mode optical fiber (SMF); this is optical fiber for only a single ray of light and having a core diameter of 9 millionths of a meter (9 µm). Other systems with multi-mode fiber cables (MM Fiber; also called premises cables) have core diameters of about 50 µm. Standardization and extensive research have brought down system costs significantly."
    paragraph_list.append(s)
    full_form_list.append("Wavelength context")
    texts = []
    taggeddoc = []
    p_stemmer = PorterStemmer()
    tokeniser = RegexpTokenizer(r'\w+')

    for index, para in enumerate(paragraph_list):
        raw = para.lower()

        tokens = tokeniser.tokenize(raw)
        stopped_tokens = [t for t in tokens if not t in stop_words]

        number_tokens = [x for x in stopped_tokens if x.isalpha()]
        stemmed_tokens = [p_stemmer.stem(i) for i in number_tokens]

        length_tokens = [i for i in stemmed_tokens if len(i) > 1]
        texts.append(length_tokens)
        td = TaggedDocument(' '.join(stemmed_tokens).split(), [full_form_list[index]])

        taggeddoc.append(td)

    return taggeddoc
Project: QProb    Author: quant-trade    | Project source | File source
def get_summarized(self, input_data, num_sentences):
        # TODO: allow the caller to specify the tokenizer they want
        # TODO: allow the user to specify the sentence tokenizer they want
        # TODO multilingual!

        tokenizer = RegexpTokenizer(r'\w+')

        stopwords_ = [smart_text(word) for word in stopwords.words('english')]

        # get the frequency of each word in the input
        base_words = [smart_text(word.lower()) for word in tokenizer.tokenize(smart_text(input_data))]
        words = [smart_text(word) for word in base_words if word not in stopwords_]
        word_frequencies = FreqDist(words)

        # now create a set of the most frequent words
        most_frequent_words = [word for word, _ in word_frequencies.most_common(100)]

        # break the input up into sentences.  working_sentences is used
        # for the analysis, but actual_sentences is used in the results
        # so capitalization will be correct.

        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        actual_sentences = sent_detector.tokenize(input_data)
        working_sentences = [sentence.lower() for sentence in actual_sentences]

        # iterate over the most frequent words, and add the first sentence
        # that includes each word to the result.
        output_sentences = []

        for word in most_frequent_words:
            for i in range(0, len(working_sentences)):
                if (word in working_sentences[i] and actual_sentences[i] not in output_sentences):
                    output_sentences.append(actual_sentences[i])
                    break
                if len(output_sentences) >= num_sentences: break
            if len(output_sentences) >= num_sentences: break

        # sort the output sentences back to their original order
        return self.reorder_sentences(output_sentences=output_sentences, input_data=input_data)
Project: Price-Comparator    Author: Thejas-1    | Project source | File source
def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        """
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                              'have', 'are', 'were', 'and', 'very', '.', ','])

        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                             'denied'])
        # Try to tokenize so that abbreviations like U.S. and monetary amounts
        # like "$23.00" are kept as tokens.
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

        #Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            self.text_words = set(lemmatize(token) for token in self.text_tokens)
            self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words
Project: Price-Comparator    Author: Thejas-1    | Project source | File source
def __init__(self, root, items, encoding='utf8'):
        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(self, root, items, sep='_',
                                    sent_tokenizer=sent_tokenizer)

#: A list of all documents and their titles in ycoe.
Project: dong_iccv_2017    Author: woozzu    | Project source | File source
def split_sentence_into_words(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(sentence.lower())
Project: w2vec-similarity    Author: jayantj    | Project source | File source
def remove_punctuation(str):
  tokenizer = RegexpTokenizer(r'\w+')
  return tokenizer.tokenize(str)
Project: tensorlm    Author: batzner    | Project source | File source
def tokenize(text, level):
    """Tokenize a text into a list of strings.

    Args:
        text (str): An arbitrary string.
        level (str): Either "char" or "word". For "char", the string is split into characters. For
            "word", letters and numbers are glued to themselves and everything else is split.
            Example: "asdf df!?123 as12" -> "asdf", " ", "df", "!", "?", "123", " ", "as", "12"

    Returns:
        list[str]: The tokens

    Raises:
        ValueError: If the level is not "char" or "word"
    """

    if level == "char":
        # No need for tokenizing
        return list(text)
    elif level == "word":
        # Tokenize while keeping indentation. Glue letters and numbers to themselves but
        # keep all other chars isolated.
        tokenizer = RegexpTokenizer(r'\w+|\S|\s')
        return tokenizer.tokenize(text)
    else:
        raise ValueError("Unknown token level: {}".format(level))
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        """
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                              'have', 'are', 'were', 'and', 'very', '.', ','])

        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                             'denied'])
        # Try to tokenize so that abbreviations like U.S. and monetary amounts
        # like "$23.00" are kept as tokens.
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

        #Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            self.text_words = set(lemmatize(token) for token in self.text_tokens)
            self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def __init__(self, root, items, encoding='utf8'):
        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(self, root, items, sep='_',
                                    sent_tokenizer=sent_tokenizer)

#: A list of all documents and their titles in ycoe.
Project: tRECS    Author: TeeOhh    | Project source | File source
def remove_punc(string):
    # '''Description: This function takes in a string of descriptions and return a tokenized string without punctuation
    #   Parameters: String of descriptions
    #   Output: Tokenized string with punctuation removed'''
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(string)
    return " ".join(tokens)
Project: neighborhood_mood_aws    Author: jarrellmark    | Project source | File source
def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        """
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                              'have', 'are', 'were', 'and', 'very', '.', ','])

        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                             'denied'])
        # Try to tokenize so that abbreviations like U.S. and monetary amounts
        # like "$23.00" are kept as tokens.
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

        #Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            self.text_words = set(lemmatize(token) for token in self.text_tokens)
            self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words
Project: neighborhood_mood_aws    Author: jarrellmark    | Project source | File source
def __init__(self, root, items, encoding='utf8'):
        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(self, root, items, sep='_',
                                    sent_tokenizer=sent_tokenizer)

#: A list of all documents and their titles in ycoe.
Project: RottenCrawler    Author: kevin940726    | Project source | File source
def getAllReviews(movieList):
    reviews = np.array(map(lambda x: x["reviews"], movieList))
    reviews = np.concatenate(reviews)

    tokenizeReview = []

    for review in reviews:
        s = review['review']
        s = RegexpTokenizer(r'\w+').tokenize(s.lower())
        s = map(lambda x: PorterStemmer().stem(x), s)
        s = filter(lambda x: x not in stopwords.words('english'), s)
        tokenizeReview.append((s, 'pos' if review["score"] >= 30 else 'neg'))

    return tokenizeReview
Project: RottenCrawler    Author: kevin940726    | Project source | File source
def getAllCritics(movieList):
    reviews = np.array(map(lambda x: x["critics"], movieList))
    reviews = np.concatenate(reviews)

    tokenizeReview = []

    for review in reviews:
        s = review['review']
        s = RegexpTokenizer(r'\w+').tokenize(s.lower())
        s = map(lambda x: PorterStemmer().stem(x), s)
        s = filter(lambda x: x not in stopwords.words('english'), s)
        tokenizeReview.append((s, 'pos' if review["tomatometer"] == "fresh" else 'neg'))

    return tokenizeReview
Project: hate-to-hugs    Author: sdoran35    | Project source | File source
def __init__(self, root, items, encoding='utf8'):
        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(self, root, items, sep='_',
                                    sent_tokenizer=sent_tokenizer)

#: A list of all documents and their titles in ycoe.
Project: FancyWord    Author: EastonLee    | Project source | File source
def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        """
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                              'have', 'are', 'were', 'and', 'very', '.', ','])

        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                             'denied'])
        # Try to tokenize so that abbreviations like U.S. and monetary amounts
        # like "$23.00" are kept as tokens.
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

        #Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            self.text_words = set(lemmatize(token) for token in self.text_tokens)
            self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words
Project: FancyWord    Author: EastonLee    | Project source | File source
def __init__(self, root, items, encoding='utf8'):
        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(self, root, items, sep='_',
                                    sent_tokenizer=sent_tokenizer)

#: A list of all documents and their titles in ycoe.
Project: Data-Mining-From-Twitter    Author: N1h1l1sT    | Project source | File source
def get_vocabulary(doc_set):
    tokenizer = RegexpTokenizer(r'\w+')
    distinctwords = {}
    i = 0
    # loop through document list
    for text in doc_set:
        raw = text.lower()
        tokens = tokenizer.tokenize(raw)
        for word in tokens:
            if word not in distinctwords:
                distinctwords[word] = i
                i += 1
    return distinctwords
Project: Data-Mining-From-Twitter    Author: N1h1l1sT    | Project source | File source
def get_frequency_table(titles, vocab):
    tokenizer = RegexpTokenizer(r'\w+')
    freqtable = np.ndarray(shape=(len(titles),len(vocab)), dtype=int, order='C')
    freqtable.fill(0)
    for i in range(0,len(titles)):
        raw = titles[i].lower()
        tokens = tokenizer.tokenize(raw)
        for token in tokens:
            index = vocab[token]
            freqtable[i][index] +=1
    return freqtable
Project: nlp_experiments    Author: amirhdrz    | Project source | File source
def _test(text=None):
    if text is None:
        text = "What are the prerequisites for csc369?\n"

    tokenizer = RegexpTokenizer(r'[\w\d]+')
    word_tokens = tokenizer.tokenize(text)
    # TODO: how to tokenize 'u of t' and 'uoft'
    # TODO: use Bing Spell Check API
    tokens = [Token(tk) for tk in word_tokens]
    ser = RegexpEntityRecognizer()
    ser.process(tokens)
Project: beepboop    Author: nicolehe    | Project source | File source
def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        """
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                              'have', 'are', 'were', 'and', 'very', '.', ','])

        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                             'denied'])
        # Try to tokenize so that abbreviations like U.S. and monetary amounts
        # like "$23.00" are kept as tokens.
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

        #Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            self.text_words = set(lemmatize(token) for token in self.text_tokens)
            self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words
Project: beepboop    Author: nicolehe    | Project source | File source
def __init__(self, root, items, encoding='utf8'):
        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(self, root, items, sep='_',
                                    sent_tokenizer=sent_tokenizer)

#: A list of all documents and their titles in ycoe.
Project: kind2anki    Author: prz3m    | Project source | File source
def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        """
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                              'have', 'are', 'were', 'and', 'very', '.', ','])

        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                             'denied'])
        # Try to tokenize so that abbreviations like U.S. and monetary amounts
        # like "$23.00" are kept as tokens.
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

        #Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            self.text_words = set(lemmatize(token) for token in self.text_tokens)
            self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words
Project: kind2anki    Author: prz3m    | Project source | File source
def __init__(self, root, items, encoding='utf8'):
        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(self, root, items, sep='_',
                                    sent_tokenizer=sent_tokenizer)

#: A list of all documents and their titles in ycoe.
Project: but_sentiment    Author: MixedEmotions    | Project source | File source
def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        """
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                              'have', 'are', 'were', 'and', 'very', '.', ','])

        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                             'denied'])
        # Try to tokenize so that abbreviations like U.S. and monetary amounts
        # like "$23.00" are kept as tokens.
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

        #Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            self.text_words = set(lemmatize(token) for token in self.text_tokens)
            self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words
Project: but_sentiment    Author: MixedEmotions    | Project source | File source
def __init__(self, root, items, encoding='utf8'):
        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(self, root, items, sep='_',
                                    sent_tokenizer=sent_tokenizer)

#: A list of all documents and their titles in ycoe.
Project: moviegeek    Author: practical-recommender-systems    | Project source | File source
def tokenize(data):
        tokenizer = RegexpTokenizer(r'\w+')

        return [tokenizer.tokenize(d) for d in data]
Project: moviegeek    Author: practical-recommender-systems    | Project source | File source
def build_lda_model(self, data, docs, n_topics=5):

        texts = []
        tokenizer = RegexpTokenizer(r'\w+')
        for d in data:
            raw = d.lower()

            tokens = tokenizer.tokenize(raw)

            stopped_tokens = self.remove_stopwords(tokens)

            stemmed_tokens = stopped_tokens
            #stemmer = PorterStemmer()
            #stemmed_tokens = [stemmer.stem(token) for token in stopped_tokens]

            texts.append(stemmed_tokens)

        dictionary = corpora.Dictionary(texts)

        corpus = [dictionary.doc2bow(text) for text in texts]

        lda_model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                                 num_topics=n_topics)

        index = similarities.MatrixSimilarity(corpus)

        self.save_lda_model(lda_model, corpus, dictionary, index)
        self.save_similarities(index, docs)

        return dictionary, texts, lda_model
Project: TextTopicNet    Author: lluisgomez    | Project source | File source
def preprocess(raw):
    # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower()
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # lemmatize tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non-alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # drop tokens shorter than three characters
    tokens = [i for i in tokens if len(i) > 2]

    return tokens
Project: TextTopicNet    Author: lluisgomez    | Project source | File source
def preprocess_imageclef(raw):
    # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower()
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # lemmatize tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non-alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # drop tokens shorter than three characters
    tokens = [i for i in tokens if len(i) > 2]

    return (tokens, text)
Project: TextTopicNet    Author: lluisgomez    | Project source | File source
def preprocess_wikidata(raw):
    # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower().split('../img/')[0]
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # lemmatize tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non-alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # drop tokens shorter than three characters
    tokens = [i for i in tokens if len(i) > 2]

    return (tokens, text)
Project: TextTopicNet    Author: lluisgomez    | Project source | File source
def preprocess(raw):
    # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower()
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # lemmatize tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non-alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # drop tokens shorter than three characters
    tokens = [i for i in tokens if len(i) > 2]

    return tokens
Project: DebateAnalysis    Author: Lingistic    | Project source | File source
def get_bigram_likelihood(statements, freq_filter=3, nbest=200):
    """
    Scores bi-grams from a group of documents by likelihood ratio
    :param statements: list of strings
    :param freq_filter: minimum # of appearances required for a bi-gram
    :param nbest: number of top bi-grams to select by likelihood ratio
    """

    words = list()
    print('Generating word list...')
    # tokenize sentences into words
    for statement in statements:
        # remove non-words
        tokenizer = RegexpTokenizer(r'\w+')
        words.extend(tokenizer.tokenize(statement))

    bigram_measures = nltk.collocations.BigramAssocMeasures()
    bigram_finder = BigramCollocationFinder.from_words(words)

    # only bi-grams that appear n+ times
    bigram_finder.apply_freq_filter(freq_filter)

    # TODO: use custom stop words
    bigram_finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in nltk.corpus.stopwords.words('english'))

    bigram_results = bigram_finder.nbest(bigram_measures.likelihood_ratio, nbest)

    return bigram_finder.score_ngrams(bigram_measures.likelihood_ratio)