Python unidecode 模块,unidecode() 实例源码

我们从 Python 开源项目中,提取了以下 49 个代码示例,用于说明如何使用 unidecode.unidecode()。

项目:word-classification    作者:vinsis    | 项目源码 | 文件源码
def createData():
    """Build shuffled (feature-dict, language) pairs from NLTK corpora.

    Words longer than 3 characters are drawn from the Brown (English),
    CESS-ESP (Spanish) and JEITA (Japanese) corpora; Spanish and Japanese
    words are ASCII-transliterated first.

    Returns:
        tuple: (training_set, testing_set) — an 80/20 split of the
        shuffled feature sets produced by ``createTupleDict``.
    """
    # Spanish: strip accents before feature extraction.
    spwords = [unidecode(w.lower()) for w in set(nltk.corpus.cess_esp.words()) if len(w) > 3]
    # English: Brown corpus, lowercased as-is.
    enwords = [w.lower() for w in set(nltk.corpus.brown.words()) if len(w) > 3]
    # Japanese: keep only romanisations that start with a lowercase letter.
    jpwords = [unidecode(w) for w in jeita.words()
               if (len(unidecode(w)) and unidecode(w)[0].islower())]
    jpwords = [w for w in set(jpwords) if len(w) > 3]

    # Label each word list; order (en, sp, jp) matches the original concatenation.
    featuresets = []
    for words, language in ((enwords, 'English'),
                            (spwords, 'Spanish'),
                            (jpwords, 'Japanese')):
        featuresets.extend((createTupleDict(w, numChars), language) for w in words)

    random.shuffle(featuresets)

    split = int(len(featuresets) * 0.8)
    return (featuresets[:split], featuresets[split:])
项目:oadoi    作者:Impactstory    | 项目源码 | 文件源码
def normalize_title(title):
    # Normalize a title for duplicate matching: truncate, lowercase,
    # transliterate to ASCII, strip simple HTML tags, drop English stop
    # words, and finally keep alphabetic characters only.
    # NOTE(review): Python 2 only — relies on unicode() and a ur"" literal.
    if not title:
        return ""

    # just first n characters
    response = title[0:500]

    # lowercase
    response = response.lower()

    # deal with unicode
    response = unidecode(unicode(response))

    # has to be before remove_punctuation
    # the kind in titles are simple <i> etc, so this is simple
    response = clean_html(response)

    # remove articles and common prepositions
    response = re.sub(ur"\b(the|a|an|of|to|in|for|on|by|with|at|from)\b", u"", response)

    # remove everything except alphas
    response = remove_everything_but_alphas(response)

    return response
项目:atoolbox    作者:liweitianux    | 项目源码 | 文件源码
def make_bib_key(self, db=None):
        """
        Build the BibTeX key <LastName><Year><Journal> for this entry.

        If ``db`` is given and the key already exists there, a numeric
        suffix (2, 3, ...) is appended until the key is unique.  The key
        is stored on ``self.bibkey``.
        """
        first_author = self.persons["author"][0]
        # ASCII-fold the surname and strip characters BibTeX chokes on.
        surname = unidecode("".join(first_author.last_names))
        surname = re.sub(r"[ {}`'\"\\]", "", surname)
        bibkey = "".join([surname,
                          self.fields["year"],
                          self.ads_record.get_bibstem()])
        # Disambiguate against existing keys with a numeric suffix.
        if db and db.exists_key(bibkey):
            suffix = 2
            while db.exists_key(bibkey + str(suffix)):
                suffix += 1
            bibkey += str(suffix)
        logger.info("Generated BibTeX key: {0}".format(bibkey))
        self.bibkey = bibkey
项目:fb-page-chat-download    作者:eisenjulian    | 项目源码 | 文件源码
def scrape_thread_list(self, threads, count):
        """Dump every thread on this page of results, then recurse into the
        next page while ``count`` pages remain."""
        for thread_info in threads['data']:
            # Optional since/until window, passed straight through to the API.
            extra_params = ''
            if self.since:
                extra_params += '&since=' + self.since
            if self.until:
                extra_params += '&until=' + self.until
            url = self.build_url('{}/messages?fields=from,created_time,message,shares,attachments&limit=400' + extra_params, thread_info['id'])
            print("GET", unidecode.unidecode(thread_info['participants']['data'][0]['name']), thread_info['id'])

            messages = self.scrape_thread(url, [])
            if messages:
                self.writer.writerow({
                    # 'page_id': thread_info['participants']['data'][1]['id'],
                    # 'page_name': thread_info['participants']['data'][1]['name'],
                    # 'user_id': thread_info['participants']['data'][0]['id'],
                    # 'user_name': thread_info['participants']['data'][0]['name'],
                    'url': thread_info['link'],
                })
            # Resolve sender ids to display names, oldest message first.
            id_map = {p['id']: p['name'] for p in thread_info['participants']['data']}
            for message in reversed(messages):
                message['from'] = id_map[message['from_id']]
                self.writer.writerow(message)

        next_url = threads.get('paging', {}).get('next', '')
        if next_url and count > 1:
            self.scrape_thread_list(requests.get(next_url).json(), count - 1)
项目:gpxupload    作者:Skippern    | 项目源码 | 文件源码
def test_objects(id, al=3, name=u"Default"):
    # Test whether the global `track` lies within (or intersects) the OSM
    # object identified by `id` at admin level `al`; returns True/False.
    # NOTE(review): Python 2 only — uses print statements below.
    logger.info("Preparing to test the results for %s (%s/%s)", clean(name), al, id)
    # Global override switch: force a positive result for one specific id.
    if forceTrue:
        if id == forceTrueID:
            logger.error("Overriding test for %s", forceTrueID)
            return True
    testOB = nullShape
    # NOTE(review): `if True:` looks like a leftover debug guard — confirm.
    if True:
        testOB = build_object(id,al,name)
        if track.within(testOB):
            logger.info(u"Track is within %s (%s/%s) place.BBOX(%s)/track.BBOX(%s)", clean(name), al, id, testOB.bounds, track.bounds )
            print u"Within {0} ({3}) ({2}/{1})".format(name, id, al, unidecode(unicode(clean(name))))
            return True
        elif track.intersects(testOB):
            logger.info(u"Track intersects with %s (%s/%s) place.BBOX(%s)/track.BBOX(%s)", clean(name), al, id, testOB.bounds, track.bounds )
            print u"Intersects {0} ({3}) ({2}/{1})".format(name, id, al, unidecode(unicode(clean(name))))
            return True
    logger.info("Rejecting %s (%s/%s) place.BBOX(%s)/track.BBOX(%s)!!!", clean(name), al, id, testOB.bounds, track.bounds )
    return False
项目:classgrade    作者:classgrade    | 项目源码 | 文件源码
def get_students(csv_file):
    """
    Split a roster CSV into already-registered and new students.

    :param csv_file: csv file with one student per row:
        first_name, last_name, email
    :type csv_file: str
    :rtype: two lists (existing_students, new_students) of [username, email]
    """
    existing_students = []
    new_students = []
    with open(csv_file) as handle:
        for row in csv.reader(handle, delimiter=','):
            # ASCII-fold the first three cells (first, last, email).
            cells = [unidecode(cell.strip()) for cell in row[:3]]
            # Username is "First_Last" with inner spaces collapsed to '_'.
            username = "_".join(cells[:2]).replace(" ", "_")
            email = cells[2]
            try:
                user = User.objects.get(username=username)
                Student.objects.get(user=user)
                existing_students.append([user.username, user.email])
            except ObjectDoesNotExist:
                new_students.append([username, email])
    return existing_students, new_students
项目:habilitacion    作者:GabrielBD    | 项目源码 | 文件源码
def http_quote(string):
    """
    Coerce a (possibly unicode) string into a quoted ASCII value that is
    safe to embed in HTTP headers and similar places.
    """
    if isinstance(string, six.text_type):
        # Prefer real transliteration when unidecode is installed;
        # otherwise the encode below substitutes replacement characters.
        try:
            import unidecode
        except ImportError:
            pass
        else:
            string = unidecode.unidecode(string)
        string = string.encode('ascii', 'replace')
    # Escape backslashes and double quotes so the quoted form survives
    # ';', ',' and friends, then wrap in double quotes.
    escaped = string.replace(b'\\', b'\\\\').replace(b'"', b'\\"')
    return '"{0!s}"'.format(escaped.decode())
项目:django-electron-pdf    作者:namespace-ee    | 项目源码 | 文件源码
def http_quote(string):
    """
    Coerce a (possibly unicode) string into a quoted ASCII value that is
    safe to embed in HTTP headers and similar places.
    """
    if isinstance(string, six.text_type):
        # Prefer real transliteration when unidecode is installed;
        # otherwise the encode below substitutes replacement characters.
        try:
            import unidecode
        except ImportError:
            pass
        else:
            string = unidecode.unidecode(string)
        string = string.encode('ascii', 'replace')
    # Escape backslashes and double quotes so the quoted form survives
    # ';', ',' and friends, then wrap in double quotes.
    escaped = string.replace(b'\\', b'\\\\').replace(b'"', b'\\"')
    return '"{0!s}"'.format(escaped.decode())
项目:word2vec_pipeline    作者:NIHOPA    | 项目源码 | 文件源码
def close_words(W, X, labels, top_n=6):
    '''
    For each cluster label, find the top_n vocabulary words closest to the
    cluster centroid.

    W is a gensim.word2vec model, X the document vectors, labels the
    predetermined cluster labels.  Returns one space-joined word string
    per label (Python 2: map() yields lists).
    '''
    summaries = []
    for label in np.unique(labels):
        # Centroid of the documents assigned to this cluster.
        mu = X[labels == label].mean(axis=0)

        # Rank the vocabulary by dot-product similarity, keep the top_n.
        order = np.argsort(W.wv.syn0.dot(mu))[::-1][:top_n]
        summaries.append(' '.join(W.wv.index2word[i] for i in order))

    # Map unicode to simple ASCII
    summaries = map(unidecode, summaries)

    # Remove _PHRASE markers
    summaries = map(lambda s: s.replace('PHRASE_', ''), summaries)

    return summaries
项目:chandl    作者:gebn    | 项目源码 | 文件源码
def make_filename(string):
    """
    Turn a string into something that can be safely used as a file or
    directory name: transliterate to ASCII and keep only alphanumerics
    plus a small whitelist of punctuation.

    :param string: The string to convert.
    :return: The sanitised string.
    :raises ValueError: If string is None or the result would be empty.
    """
    if string is None:
        raise ValueError('String cannot be None')

    # Characters allowed besides alphanumerics.
    extras = {' ', '.', '_', '-', '\''}
    kept = ''.join(c for c in unidecode.unidecode(string)
                   if c.isalnum() or c in extras)
    result = kept.strip()
    if not result:
        raise ValueError('Filename would be empty')
    return result
项目:sciz    作者:erk3    | 项目源码 | 文件源码
def reset_groups_conf(self, group_name=None):
        # Re-push configuration to one named group, or to every group when
        # no (valid) name is given.
        # NOTE(review): Python 2 only — uses unicode, str.decode and the
        # list-returning filter() to flatten the name.
        if group_name and isinstance(group_name, str) and group_name != '':
            if not isinstance(group_name, unicode):
                group_name = group_name.decode(sg.DEFAULT_CHARSET)
            # "Flat" name: lowercased, ASCII-folded, alphanumerics only —
            # must match the GROUP.flat_name column.
            flat_name = filter(str.isalnum, unidecode.unidecode(group_name.lower()))
            sg.logger.info('Reseting conf for group %s...' % flat_name)
            try:
                group = sg.db.session.query(GROUP).filter(GROUP.flat_name == flat_name).one()
                self.__push_group_conf(group, True)
            except NoResultFound as e:
                # Unknown group: log and do nothing.
                sg.logger.warning('No group %s, aborting reset confs...' % (flat_name))
        else:
            sg.logger.info('Reseting conf for all groups...')
            groups = sg.db.session.query(GROUP).all()
            for group in groups:
                self.__push_group_conf(group, True)

    # Routine for pushing conf to a group
项目:hamutils    作者:sq8kfh    | 项目源码 | 文件源码
def _create_field(self, record_node, field, data):
        """Append an ADIF field node (plus its UTF-8 twin when defined) to
        ``record_node``.

        The value is formatted per the adif_field type code: 'D' -> date,
        'T' -> time, 'B' -> Y/N boolean, anything else stringified.  The
        plain node always receives an ASCII-transliterated value.

        :raises WriteError: if ``field`` is neither a known ADIF field nor
            an application-defined ``app_*`` field.
        """
        if data is None:  # fixed: identity check instead of `== None`
            return
        l_field = field.lower()
        if l_field in adif_field:
            if adif_field[l_field] == 'D':
                tmp_data = data.strftime('%Y%m%d')
            elif adif_field[l_field] == 'T':
                tmp_data = data.strftime('%H%M%S')
            elif adif_field[l_field] == 'B':
                tmp_data = 'Y' if data else 'N'
            else:
                tmp_data = str(data)

            # Fields with a *_INTL twin also get the raw (UTF-8) value.
            if l_field in adif_rev_utf_field:
                record_node.appendChild(self._create_node(adif_rev_utf_field[l_field], tmp_data))
            record_node.appendChild(self._create_node(l_field, unidecode(tmp_data)))
        elif l_field.startswith('app_'):
            record_node.appendChild(self._create_node(l_field, str(data)))
        else:
            raise WriteError('unknown field: \'%s\'' % l_field)
项目:data_utilities    作者:fmv1992    | 项目源码 | 文件源码
def series_to_ascii(series):
    """Return a lowercased, ASCII-transliterated copy of a series with all
    non-alphanumeric characters replaced by underscores.

    Arguments:
        series (pandas.Series): series to transform (a copy is modified,
            not the original).

    Returns:
        pandas.Series: series with lowercase and no symbols.

    """
    warnings.warn("Function will be deprecated because it is not used.",
                  category=DeprecationWarning)
    result = series.copy(True)
    # Transliterate first, then lowercase, then squash symbols.
    result = result.apply(unidecode)
    result = result.str.lower()
    return result.str.replace('[^a-zA-Z0-9_]', '_')
项目:datasurvey    作者:occrp    | 项目源码 | 文件源码
def scan(self):
        """Extract this Cronos database into $CRONOS_OUTDIR/<slugified path>.

        Does nothing (with a warning) when CRONOS_OUTDIR is unset; parse
        failures are logged rather than raised.
        """
        log.info('Cronos extract: %s', self.path_name)
        target_dir = os.environ.get('CRONOS_OUTDIR')
        if target_dir is None:
            log.warning('No CRONOS_OUTDIR is set.')
            return
        sub_dir = slugify(unidecode(self.path_name), '_')
        target_dir = os.path.join(target_dir, sub_dir)
        try:
            os.makedirs(target_dir)
        except OSError:
            # Directory (or a parent) already exists — that is fine.
            # (Was a bare `except: pass`, which also hid real errors.)
            pass
        try:
            parse(self.real_path, target_dir)
        except Exception as ex:
            log.exception(ex)
项目:ingunict-bot    作者:gabrik    | 项目源码 | 文件源码
def professors_handler(bot, update):
    """Handle `/prof <nome cognome>`: search professors by (partial) name
    and reply with their details, or with a usage/error message."""
    parts = update.message.text.split(' ')

    if len(parts) < 2:
        bot.sendMessage(update.message.chat_id, text="Devi inserire il professore su cui ottenere informazioni!\n/prof <nome cognome>")
        return

    professor_name = unidecode(" ".join(parts[1:]))
    if len(professor_name) <= 3:
        bot.sendMessage(update.message.chat_id, text='Inserisci almeno 4 caratteri per la ricerca')
        return

    # Case-insensitive substring match on the professor's name.
    matches = [p for p in professors if professor_name.upper() in p['Nome'].upper()]
    if not matches:
        bot.sendMessage(update.message.chat_id, text='Professore non trovato')
        return

    bot.sendMessage(update.message.chat_id, text='Sono stati trovati %d professori '\
                                                    'con la tua ricerca' % len(matches))
    descr = ""
    for p in matches:
        descr += "Nome: %s\nQualifica: %s\nDipartimento: %s\n" % (p['Nome'], p['Qualifica'], p['Dipartimento'])
        descr += "Indirizzo: %s\nEmail: %s\nTelefono: %s\n" % (p['Indirizzo'], p['Email'], p['Telefono'])
        descr += "Sito: %s\nSSD: %s\n\n" % (p['Sito'], p['SSD'])
    bot.sendMessage(update.message.chat_id, text=descr)
项目:ingunict-bot    作者:gabrik    | 项目源码 | 文件源码
def classroom_handler(bot, update):
    """Handle `/insegnamento <nome>`: search teachings by (partial) name
    and reply with their details, or with a usage/error message."""
    parts = update.message.text.split(' ')

    # NB: exactly one argument token is required (original behaviour).
    if len(parts) != 2:
        bot.sendMessage(update.message.chat_id, text="Devi inserire l'insegnamento su cui ottenere informazioni!\n/insegnamento <nome>")
        return

    insegnamento_name = unidecode(" ".join(parts[1:]))
    if len(insegnamento_name) <= 3:
        bot.sendMessage(update.message.chat_id, text='Inserisci almeno 4 caratteri per la ricerca')
        return

    matches = [i for i in classrooms if insegnamento_name.upper() in i['Nome'].upper()]
    if not matches:
        bot.sendMessage(update.message.chat_id, text='Insegnamento non trovato')
        return

    bot.sendMessage(update.message.chat_id, text='Sono stati trovati %d insegnamenti con la tua ricerca' % len(matches))
    descr = ""
    for m in matches:
        doc = ''.join([docente + '\n' for docente in m['Docenti']])
        descr += "Nome: %s\nSemestre: %s\nCorso di Laurea: %s\n" % (m['Nome'], m['Semestre'], m['Corso di Laurea'])
        descr += "Anno: %s\nDocenti: %s\nSSD: %s\n" % (m['Anno'], doc, m['SSD'])
        descr += "CFU: %s\n\n" % (m['CFU'])
    bot.sendMessage(update.message.chat_id, text=descr)
项目:ingunict-bot    作者:gabrik    | 项目源码 | 文件源码
def courses_handler(bot, update):
    """Handle `/corso <nome>`: search degree courses by (partial) name and
    reply with their details, or with a usage/error message."""
    parts = update.message.text.split(' ')
    if len(parts) != 2:
        bot.sendMessage(update.message.chat_id, text="Devi inserire il corso su cui ottenere informazioni!\n/corso <nome>")
        return

    nome_corso = unidecode(parts[1])
    if len(nome_corso) <= 3:
        bot.sendMessage(update.message.chat_id, text='Inserisci almeno 4 caratteri per la ricerca')
        return

    matches = [c for c in courses if nome_corso.upper() in c['Denominazione'].upper()]
    if not matches:
        bot.sendMessage(update.message.chat_id, text='Corso non trovato')
        return

    bot.sendMessage(update.message.chat_id, text='Sono stati trovati %d corsi con la tua ricerca' % len(matches))
    descr = ""
    for corso in matches:
        descr += "Nome: %s\nID: %s\n" % (corso['Denominazione'], corso['ID'])
        descr += "Codice: %s\nOrdinamento: %s\n Tipo: %s\n\n" % (corso['Codice'], corso['Ordinamento'], corso['Tipo'])
    bot.sendMessage(update.message.chat_id, text=descr)
项目:ingunict-bot    作者:gabrik    | 项目源码 | 文件源码
def exams_handler(bot, update):
    """Handle the exams command: list exams for the given course id, one
    message per exam, or reply with a usage/error message."""
    parts = update.message.text.split(' ')
    if len(parts) != 2:
        bot.sendMessage(update.message.chat_id, text="Inserisci l'id del corso, lo puoi conoscere usando il comando corsi")
        return

    cds_id = unidecode(parts[1])
    matches = [e for e in exams if cds_id == str(e['CDS_ID'])]
    if not matches:
        bot.sendMessage(update.message.chat_id, text="Corso non trovato verifica di aver inserito l'id corretto")
        return

    bot.sendMessage(update.message.chat_id, text='Sono stati trovati %d esami con la tua ricerca' % len(matches))
    for esame in matches:
        descr = "Materia: %s\nData: %s\nOra: %s\n" % (esame['Insegnamento'], esame['Data'], esame['Ora'])
        descr += 'Aula: %s\n Scaglione: %s\nTipo: %s\nTipo Appello:%s\n\n' % (esame['Aula'], esame['Scaglione'], esame['Tipo Esame'], esame['Appello'])
        bot.sendMessage(update.message.chat_id, text=descr)
项目:catchup4kodi    作者:catchup4kodi    | 项目源码 | 文件源码
def cleanUnicode(string):   
    # Best-effort conversion of an arbitrary string to plain ASCII:
    # try unidecode transliteration first, then NFKD-normalize and drop
    # anything non-ASCII; on total failure return the input unchanged.
    # NOTE(review): Python 2 only — uses `unicode` and str.decode().
    try:
        try:
            #string = str(string)
            if isinstance(string, unicode):
                unicode_replaced_str = string.decode('utf-8')
            elif isinstance(string, str):
                unicode_replaced_str = string.decode('utf-8')
            # unidecode is optional: an ImportError lands in the except below.
            import unidecode
            unicode_replaced_str = unidecode.unidecode(unicode_replaced_str)
            string = unicode_replaced_str

        except:
            # Transliteration is best-effort; fall through to NFKD below.
            pass

        fixed_string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore' )    
        return fixed_string
    except:
        # Last resort: give back whatever we were handed.
        return string

#interface:
项目:lug    作者:shellterlabs    | 项目源码 | 文件源码
def parse(self, text, company_name):
        """Parse a search-results page: for each result item, print and log
        '[+] :: <name> :: <occupation>' (ASCII-transliterated)."""
        soup = BeautifulSoup(text, 'lxml')
        for item in soup.findAll('li', {'class': 'search-result'}):
            name_node = item.find('span', {'class': 'actor-name'})
            name = name_node.text if name_node else "??"
            occ_node = item.find('p', {'class': 'search-result__snippets'})
            occupation = occ_node.text.replace('\n', ' ') if occ_node else "??"
            try:
                print('[+] :: {} :: {}'.format(unidecode(name), unidecode(occupation)))
                self.filewrite('[+] :: {} :: {}\n'.format(unidecode(name), unidecode(occupation)))
            except Exception as e:
                # Fallback: byte-encode with replacement when the direct
                # transliteration/printing fails.
                print('[+] :: {} :: {}\n'.format(unidecode(name.encode('utf-8', 'replace')),
                                                 unidecode(occupation.encode('utf-8', 'replace'))))
                self.filewrite('[+] :: {} :: {}\n'.format(unidecode(name.encode('utf-8', 'replace')),
                                                          unidecode(occupation.encode('utf-8', 'replace'))))
项目:roulier    作者:akretion    | 项目源码 | 文件源码
def _normalize_coerce_zpl(self, value):
        """Sanitize input for ZPL.

        Transliterates accents to ASCII and strips the ZPL control
        characters (~, ^ and their RS/DLE substitutions).  Non-string
        values are returned untouched.
        """
        if not isinstance(value, basestring):  # Python 2 string check
            return value

        # ZPL control characters to strip after transliteration:
        # ~ (0xFE), ^ (0x5E), RS (0x1E, ^ substitution), DLE (0x10, ~ substitution).
        ctrl_chars = (0xFE, 0x5E, 0x1E, 0x10)
        sanitized = unidecode(value)
        for code in ctrl_chars:
            sanitized = sanitized.replace("%c" % code, "")
        return sanitized
项目:twic_close_reading    作者:jarmoza    | 项目源码 | 文件源码
def ConvertToPlainText_Chunks(self, p_output_dir, p_file_number, p_chunk=True, p_chunk_size=5000):
        """Write this document's prepared lines as ASCII plaintext files.

        When p_chunk is true the lines are split into chunks of
        p_chunk_size (via Utils_MalletInterpret.GetChunkedLines) and each
        chunk becomes its own numbered file.  Returns the number of
        chunks written.
        """
        file_name = self.GetFilename()
        file_ext = self.GetFileExtension()
        output_lines = self.GetPreparedLines()

        # Either chunk the lines or treat everything as one chunk.
        if p_chunk:
            chunks = Utils_MalletInterpret.GetChunkedLines(output_lines, p_chunk_size)
        else:
            chunks = [output_lines]

        # One output file per chunk, numbered sequentially.
        for index, chunk in enumerate(chunks):
            out_path = "{0}{1}_{2}_{3}{4}".format(p_output_dir, p_file_number, file_name, index, file_ext)
            with open(out_path, 'w') as plaintext_output_file:
                for line in chunk:
                    plaintext_output_file.write(unidecode(line) + u"\n")
            p_file_number += 1

        return len(chunks)
项目:SO-CAL    作者:sfu-discourse-lab    | 项目源码 | 文件源码
def output_preprocessed_data(self, json_input, file_name):
        '''
        Output preprocessed data into a file, one "word/POS ..." line per
        sentence, ASCII-transliterated.
        :param json_input: json formatted data generated from function str_process
        :param file_name: output file name
        :return: None
        '''
        lines = [" ".join(tok['originalText'] + "/" + tok['pos'] for tok in sent['tokens'])
                 for sent in json_input['sentences']]
        output_file_path = self.output_folder + '/' + file_name
        # Truncate any pre-existing file so the append below starts fresh.
        if os.path.exists(output_file_path):
            open(output_file_path, 'w').close()
        with open(output_file_path, 'a') as preprocessed_out:
            for line in lines:
                preprocessed_out.write(unidecode.unidecode(line) + "\n")
项目:Sentences-analysis    作者:sungminoh    | 项目源码 | 文件源码
def preprocess(post):
  # Normalize a social-media post: strip punctuation, mark emoji,
  # transliterate to ASCII and collapse whitespace; returns unicode.
  # NOTE(review): Python 2 only — print statement and str.decode below.
  # example
  # {(romeo and juliet 2013),(romeo and juliet),(douglas booth),(hailee steinfeld)}"
  # -> romeo and juliet 2013 romeo and juliet douglas booth hailee steinfeld
  print post
  # remove all punctuations
  post = PUNCTUATION.sub(' ', utils.to_unicode(post))

  # replace all emoji characters to '_EMOTICON_' and add space in between.
  post = EMOTICON.sub(' _emoticon_ ', post)

  # convert all special characters to ascii characters
  post = unidecode(post).decode('ascii', 'ignore')

  # remove all whitespace into single one
  post = WHITESPACE.sub(' ', post).strip()
  return utils.to_unicode(post)
项目:sbrt2017    作者:igormq    | 项目源码 | 文件源码
def _sanitize(self, text):
        """Normalize transcript text: collapse spaces, drop digits,
        strip accents and punctuation, and lowercase.

        NOTE(review): Python 2 only — string.maketrans and the two-argument
        str.translate(None, ...) form do not exist on Python 3.
        """
        # removing duplicated spaces
        text = ' '.join(text.split())

        # removing digits
        text = ''.join([c for c in text if not c.isdigit()])

        # removing accents
        text = unidecode(text)

        # removing punctuations: first map '-' and "'" to spaces, then
        # delete everything in string.punctuation
        text = text.translate(
            string.maketrans("-'", '  ')).translate(None,
                                                    string.punctuation)

        # remove uppercase
        text = text.lower()

        return text
项目:fastpost    作者:kosc    | 项目源码 | 文件源码
def form_valid(self, form):
        """Create a post from the submitted form: set its author and slug,
        save it, attach any comma-separated tags, then redirect to the
        new post's page."""
        form = PartialNewPostForm(self.request.POST)
        post = form.save(commit=False)
        post.author = self.request.user
        # ASCII-fold the title before slugifying so non-latin titles
        # still produce a usable slug.
        post.slug = slugify(unidecode(post.title))
        post.save()
        if self.request.POST['tags_field']:
            tag_names = self.request.POST['tags_field'].replace(', ', ',').split(',')
            for tag_name in tag_names:
                tag = Tag()
                tag.post = post
                tag.name = tag_name
                tag.save()
        self.success_url = "/post/" + post.slug
        return super(NewPostView, self).form_valid(form)
项目:Laima-Discord-Bot    作者:glouis    | 项目源码 | 文件源码
def search(keywords, lang):
    """Find cards (in language *lang*) whose tags match *all* keywords,
    case- and accent-insensitively.

    Returns (cards, count); ([], 0) when nothing matches.
    """
    normalized = [unidecode.unidecode(keyword).lower() for keyword in keywords]
    with model.laima_db.transaction():
        # Cards joined to their data, restricted to the requested language,
        # then filtered to those carrying every normalized keyword as a tag.
        query = (model.CardText
            .select(model.CardText, model.CardData)
            .join(model.CardData)
            .switch(model.CardText)
            .where(model.CardText.lang == lang)
            .join(model.CardTextTag)
            .join(model.Tag)
            .where(model.Tag.name << normalized)
            .group_by(model.CardText)
            .having(fn.Count(model.Tag.id) == len(keywords))
            .order_by(model.CardText.name))
        if not query.exists():
            return [], 0
        total = query.count()
        matching = [card for card in query]
        return matching, total
项目:plugin.audio.tidal2    作者:arnesongit    | 项目源码 | 文件源码
def log(self, txt = '', level=xbmc.LOGDEBUG):
        ''' Log a text into the Kodi-Logfile.

        Messages are emitted only when detailLevel > 0 (or on errors);
        higher detail levels promote DEBUG/SEVERE messages to NOTICE.
        NOTE(review): Python 2 only — checks `unicode` and formats bytes.
        '''
        try:
            if self.detailLevel > 0 or level == xbmc.LOGERROR:
                if self.detailLevel == 2 and level == xbmc.LOGDEBUG:
                    # More Logging
                    level = xbmc.LOGNOTICE
                elif self.detailLevel == 3 and (level == xbmc.LOGDEBUG or level == xbmc.LOGSEVERE):
                    # Complex Logging
                    level = xbmc.LOGNOTICE
                if level != xbmc.LOGSEVERE:
                    # ASCII-fold unicode text so the byte-formatting below works.
                    if isinstance(txt, unicode):
                        txt = unidecode(txt)
                    xbmc.log(b"[%s] %s" % (self.pluginName, txt), level)
        except:
            # Never let logging itself crash the plugin.
            xbmc.log(b"[%s] Unicode Error in message text" % self.pluginName, xbmc.LOGERROR)
项目:datagorri    作者:julhac    | 项目源码 | 文件源码
def create_file(path, list_to_save):
        """Write a list of dicts to *path* as a semicolon-separated table.

        The header row is the union of all keys across entries (in
        first-seen order); every value is ASCII-transliterated before
        writing.  Lines that fail transliteration or writing are printed
        and skipped.
        """
        # Union of all keys, preserving first-seen order.
        headers = []
        for entry in list_to_save:
            for key in entry:  # values were unused in the original .items() loop
                if key not in headers:
                    headers.append(key)

        # `with` guarantees the handle is closed (the original leaked it:
        # open() without close()).
        with open(path, 'w') as f:
            f.write(";".join(headers) + '\n')

            for entry in list_to_save:
                line = ''
                for header in headers:
                    if header in entry:
                        line += entry[header]
                    line += ';'

                try:
                    f.write(unidecode(line) + "\n")
                except Exception as e:
                    print(e)
项目:NLPre    作者:NIHOPA    | 项目源码 | 文件源码
def __call__(self, unicode_text):
        '''
        Runs the parser.

        Args:
            unicode_text: a unicode document
        Returns:
            text: An ascii equivalent of unicode_text
        '''
        # The whole job is delegated to unidecode's transliteration.
        ascii_text = unidecode.unidecode(unicode_text)
        return ascii_text

# if __name__ == "__main__":
#    text = u"α-Helix β-sheet ?? ?? ?? ?? ?? ??"  # (non-ASCII demo text, partially garbled in this copy)
#    parser = unidecoder()
#    print(parser(text))
项目:NLPre    作者:NIHOPA    | 项目源码 | 文件源码
def __call__(self, unicode_text):
        '''
        Runs the parser.

        Args:
            unicode_text: a unicode document
        Returns:
            text: An ascii equivalent of unicode_text
        '''
        # The whole job is delegated to unidecode's transliteration.
        ascii_text = unidecode.unidecode(unicode_text)
        return ascii_text

# if __name__ == "__main__":
#    text = u"α-Helix β-sheet ?? ?? ?? ?? ?? ??"  # (non-ASCII demo text, partially garbled in this copy)
#    parser = unidecoder()
#    print(parser(text))
项目:geoextract    作者:stadt-karlsruhe    | 项目源码 | 文件源码
def normalize(self, s):
        '''
        Normalize text: lowercase, optionally ASCII-fold, rejoin
        hyphenated line breaks, drop intra-word hyphens and special
        characters, apply configured substitutions and stemming, and
        collapse whitespace.  The steps are order-sensitive.
        '''
        s = s.strip().lower()
        if self.to_ascii:
            # Transliterate to plain ASCII first so later regexes see ASCII.
            s = unidecode(s)
        if self.rejoin_lines:
            # Rejoin words hyphenated across line breaks ("foo-\nbar" -> "foo-bar").
            s = re.sub(r'(\w-)\s*\n\s*', r'\1', s, flags=_RE_FLAGS)
        if self.remove_hyphens:
            # Drop hyphens between letters ("co-op" -> "coop").
            s = re.sub(r'([^\W\d_])-+(?=[^\W\d_])', r'\1', s, flags=_RE_FLAGS)
        if self.remove_specials:
            # Strip runs of punctuation in three positions: between
            # non-digits, after a word, and before a word.
            s = re.sub(r'(\D|^)([^\w\s]|_)+(?=\D|$)', r'\1 ', s,
                       flags=_RE_FLAGS)
            s = re.sub(r'(\w)([^\w\s]|_)+\s+', r'\1 ', s, flags=_RE_FLAGS)
            s = re.sub(r'\s+([^\w\s]|_)+(?=\w)', r'\1 ', s, flags=_RE_FLAGS)
        # User-configured pattern substitutions, in order.
        for pattern, replacement in self.subs:
            s = re.sub(pattern, replacement, s, flags=_RE_FLAGS)
        if self._stemmer:
            # Stem each word-like token in place.
            callback = lambda m: self._stemmer.stem(m.group())
            s = re.sub(r'([^\W\d_]|-)+', callback, s, flags=_RE_FLAGS)
        # Collapse all runs of whitespace to single spaces.
        s = re.sub(r'\s+', ' ', s, flags=_RE_FLAGS)
        return s.strip()
项目:czl-scrape    作者:code4romania    | 项目源码 | 文件源码
def get_type(self, text):
        """Classify a (Romanian) legal-document title as OM / LEGE / HG /
        OUG / OG, or None when no keyword matches.

        Only the part of the title before 'pentru'/'privind' is inspected.
        """
        text = unidecode(text).lower().strip()
        type = None

        # Bug fix: re.search() returns None when nothing matches, so
        # calling .start() directly crashed; and a match at position 0
        # made `if stop_pos:` falsy.  Test the match object instead
        # (same approach as the sibling get_type implementation).
        stop_pos = re.search(r'(pentru|privind)', text)
        if stop_pos:
            text = text[0:stop_pos.start()]

        if re.search(r'ordin', text):
            type = 'OM'

        if re.search(r'lege', text):
            type = 'LEGE'

        if re.search(r'hotarare', text):
            type = 'HG'

        if re.search(r'ordonanta', text):
            if re.search(r'urgenta', text):
                type = 'OUG'
            else:
                type = 'OG'

        return type
项目:czl-scrape    作者:code4romania    | 项目源码 | 文件源码
def get_feedback_date(self, text):
        """Extract the feedback deadline ('data limita ...') from *text*.

        Accepts 'dd.mm.yyyy' or 'dd <month-name> yyyy'.  Returns a
        datetime, or None when no parseable date follows the phrase.
        NOTE(review): '%d %B %Y' month names depend on the active locale.
        """
        known_formats = ['%d %B %Y', '%d.%m.%Y']
        text = unidecode(text.strip().lower())

        phrase = re.search(r'data limita.*((\d\d?\.\d\d?\.20\d\d)|(\d\d?\s[a-z]+\s20\d\d))', text)
        if not phrase:
            return None

        date_match = re.search(r'(\d\d?\.\d\d?\.20\d\d)|(\d\d?\s[a-z]+\s20\d\d)', phrase.group(0))
        if not date_match:
            return None

        raw_date = date_match.group(0)
        for fmt in known_formats:
            try:
                parsed = datetime.datetime.strptime(raw_date, fmt)
                if parsed:
                    return parsed
            except ValueError:
                # Wrong format for this candidate; try the next one.
                pass
项目:czl-scrape    作者:code4romania    | 项目源码 | 文件源码
def get_type(self, text):
        """Classify a (Romanian) legal-document title as OM / LEGE / HG /
        OUG / OG, or None when no keyword matches.

        Only the part of the title before 'pentru'/'privind' is inspected.
        """
        text = unidecode(text).lower().strip()

        stop = re.search(r'(pentru|privind)', text)
        if stop:
            text = text[0:stop.start()]

        # In the original, later checks overwrote earlier ones, so the
        # effective priority is: ordonanta > hotarare > lege > ordin.
        if re.search(r'ordonanta', text):
            return 'OUG' if re.search(r'urgenta', text) else 'OG'
        if re.search(r'hotarare', text):
            return 'HG'
        if re.search(r'lege', text):
            return 'LEGE'
        if re.search(r'ordin', text):
            return 'OM'
        return None
项目:pyjam    作者:10se1ucgo    | 项目源码 | 文件源码
def __init__(self, audio_dir=os.curdir, audio_rate=11025, mod_path=os.curdir,
                 name=None, play_key='F8', relay_key='=', use_aliases=True):
        """
        Args:
            audio_dir (str): Path for finding audio.
            audio_rate (int): The sample rate the game accepts.
            mod_path (str): Path to the mod folder (e.g. "Steam/SteamApps/common/Team Fortress 2/tf2")
            name (str): The name of the game.
            play_key (str): The key used to start/stop music in-game.
            relay_key (str): The key used to interact with the game.
            use_aliases (bool): Whether or not to use aliases to select songs in-game.
        """
        self.audio_dir = audio_dir
        self.audio_rate = audio_rate
        self.mod_path = mod_path
        # ASCII-fold the game name so it is safe wherever it gets embedded.
        self.name = unidecode.unidecode(name)
        # Fall back to defaults for keys that cannot be bound.
        if bindable(play_key):
            self.play_key = play_key
        else:
            self.play_key = "F8"
        if bindable(relay_key):
            self.relay_key = relay_key
        else:
            self.relay_key = "="
        self.use_aliases = use_aliases
项目:Hanhan_NLP    作者:hanhanwu    | 项目源码 | 文件源码
def output_preprocessed_data(self, json_input, file_name):
        '''
        Append preprocessed data to a file, one "word/POS ..." line per
        sentence, ASCII-transliterated.
        :param json_input: json formatted data generated from function str_process
        :param file_name: output file name
        :return: None
        '''
        lines = [" ".join(t['originalText'] + "/" + t['pos'] for t in sent['tokens'])
                 for sent in json_input['sentences']]
        output_file_path = self.output_folder + '/' + file_name
        with open(output_file_path, 'a') as preprocessed_out:
            for line in lines:
                preprocessed_out.write(unidecode.unidecode(line) + "\n")
项目:aio    作者:pavhofman    | 项目源码 | 文件源码
def parseToJsonStr(self, metadata: dict) -> Optional[str]:
        """
        Map raw *metadata* keys onto canonical metadata names and serialize.

        For each rule (metadata name -> candidate keys) the first candidate
        present in *metadata* with a non-empty value wins; values are
        ASCII-transliterated.

        :return: json string or None if no matching non-empty metadata found
        """
        collected = {}
        for md, possibleKeys in self.__rulesDict.items():
            for key in possibleKeys:
                if key not in metadata:
                    continue
                value = metadata.get(key)
                if len(value) > 0:
                    collected[md.value] = unidecode(value)
                    # First non-empty hit wins; skip remaining candidates.
                    break

        if collected:
            return json.dumps(collected)
        return None
项目:recobot    作者:h4ck3rk3y    | 项目源码 | 文件源码
def similar_users(user):
    """
    Return users similar to *user* based on shared files, via SVD.

    If this user's file list has not yet been exported to the ratings file,
    append it and mark the user as recommended in the DB, then (re)build the
    SVD model over the whole ratings file.

    @param user: username; non-str input is ASCII-folded to str
    @return: list of similar usernames
    """
    # Fix: use isinstance() instead of `type(user) is str`.
    if not isinstance(user, str):
        user = unidecode.unidecode(user)
    if db.done_users.find_one({'user': user})['recommended'] == False:
        user_files = db.user_list.find({'user': user})
        # Fix: context manager guarantees the file is closed even on error
        # (the original open()/close() pair leaked on exceptions).
        with open('./dc_recom.dat', 'a') as f:
            for u in user_files:
                f.write(u['user'] + '::' + u['tth'] + '\n')
        db.done_users.update({'user': user}, {'user': user, 'recommended': True})

    data = Data()
    data.load('./dc_recom.dat', sep='::', format={'col': 1, 'row': 0})
    svd = SVD()
    svd.set_data(data)
    # k=1000 latent factors; rows normalized after factorization.
    svd.compute(k=1000, min_values=0, pre_normalize=None, mean_center=False,
                post_normalize=True)
    return [i[0] for i in svd.similar(user)]
项目:gender_classifier    作者:LU-C4i    | 项目源码 | 文件源码
def initialCheckName(self, name):
        """
        Quick gender check based on gender-specific words in a name.

        Names written in Cyrillic or Greek script are transliterated to
        ASCII first; then the first two whitespace-separated words are
        checked against the known male/female word lists.

        :param name: full name string
        :return: ('male', 1) or ('female', 1) on a match, otherwise (None, 0)
        """
        # Transliterate if the name is written entirely in Cyrillic or Greek.
        if only_cyrillic_chars(name) or only_greek_chars(name):
            name = unidecode(name)

        # Fix: the original duplicated this logic for words 0 and 1 and
        # crashed with IndexError on an empty name; a bounded loop handles
        # both words and degrades gracefully when the name is empty.
        for word in name.split()[:2]:
            if word in self.maleWords:
                return ('male', 1)
            if word in self.femaleWords:
                return ('female', 1)
        return (None, 0)
项目:indivisible    作者:danieltahara    | 项目源码 | 文件源码
def get_statements_by_person(self, first_name, last_name, limit=0):
        """
        Get statements and ratings by name.

        @param first_name: of MoC
        @param last_name: of MoC
        @param limit: optional limit (values <= 0 fall back to 10)
        @return: statements
        """
        if limit <= 0:
            limit = 10
        # Names are lowercased and ASCII-folded to match the API's URL scheme.
        path = (
            "statements/truth-o-meter/people/{first_name}-{last_name}/"
            "json/?n={limit}"
        ).format(first_name=unidecode(first_name.lower()),
                 last_name=unidecode(last_name.lower()),
                 limit=limit)
        results = self._get(path)
        return results or []
项目:telegram-yt_mp3-bot    作者:Javinator9889    | 项目源码 | 文件源码
def descarga(full_name):
    """
    Upload *full_name* to file.io and return the direct download link.

    Retries until file.io returns a JSON payload; an '<html>' body signals
    a failed upload.

    :param full_name: path of the file to upload
    :return: direct download link (str)
    """
    url = 'https://file.io/?expires=1w'
    print("\n\tSubiendo archivo a 'file.io'")
    link = None
    n = 0
    while link is None:                                                  # For ensuring that the file is uploaded correctly
        # Fix: re-open the file on every attempt. The original opened it
        # once and never closed it (leak), and a handle already consumed by
        # a failed POST would upload an empty body on retry.
        with open(full_name, 'rb') as fh:
            response = requests.post(url, files={'file': fh})
        test = response.text
        print("JSON recibido: ", test)
        decoded = unidecode(test)                                        # It's needed to decode text for avoiding 'bytes' problems (b'<meta...)
        print("JSON decodificado: ", decoded)
        if '<html>' in decoded:                                          # When upload fails, 'file.io' sends a message with <html> header.
            print("\n\tFallo al subir el archivo. Reintentando... #", n)
            n += 1                                                       # Little counter
        else:
            json_data = json.loads(decoded)
            link = json_data['link']
            print("\n\nEnlace de descarga directa: ", link)
    return link
项目:openrefine-wikidata    作者:wetneb    | 项目源码 | 文件源码
def fuzzy_match_strings(ref, val):
    """
    Returns the matching score of two values.

    Q-id forms (via to_q) are compared exactly; otherwise both strings are
    ASCII-folded, lowercased and scored with a symmetric token-sort ratio.

    :return: integer score in [0, 100]
    """
    if not ref or not val:
        return 0
    ref_q = to_q(ref)
    val_q = to_q(val)
    if ref_q or val_q:
        return 100 if ref_q == val_q else 0
    simplified_val = unidecode(val).lower()
    simplified_ref = unidecode(ref).lower()

    # Return symmetric score.
    # Fix: the original assigned `r2 = r1` right after computing r2,
    # discarding the second ratio and defeating the symmetric average.
    r1 = fuzz.token_sort_ratio(simplified_val, simplified_ref)
    r2 = fuzz.token_sort_ratio(simplified_ref, simplified_val)
    return int(0.5 * (r1 + r2))
项目:TwentyTwo    作者:EPITECH-2022    | 项目源码 | 文件源码
async def emoji(self, context):
        '''Send the command's text with letters replaced by regional-indicator emoji.

        Non-alphabetic characters pass through unchanged. For the 'bi'
        invocation, b/B/p/P become :b:. The triggering message is deleted
        for 'ri', 'riz' and 'bi' when permissions allow.
        '''
        # Fix: the body awaits bot coroutines, so this must be `async def`;
        # a plain `def` containing `await` is a SyntaxError in Python 3.
        from unidecode import unidecode
        content = self.bot.get_text(context)
        if content in [None, '', ' '] or context.invoked_with == 'riz' and not self.bot.is_owner(context.message.author):
            return
        msg = ''
        if context.invoked_with in ['ri', 'bi']:
            # Prefix with the author so the origin stays visible after deletion.
            msg += '`{}`: '.format(context.message.author)
        for c in content:
            if c.isalpha():
                b = context.invoked_with == 'bi' and c in ['b', 'B', 'p', 'P']
                if b:
                    msg += ':b:'
                else:
                    msg += ':regional_indicator_{}:'.format(unidecode(c.lower()))
            else:
                msg += c
        await self.bot.say(msg)
        await self.bot.replied(context)
        if context.invoked_with in ['ri', 'riz', 'bi']:
            try:
                await self.bot.delete_message(context.message)
            except discord.errors.Forbidden:
                # Missing manage-messages permission; leave the message be.
                pass
项目:Wagtail-Image-Folders    作者:anteatersa    | 项目源码 | 文件源码
def validate_folder(self):
        """Validates whether a folder can be created.

        Performs two types of validation:
        1. Checks if a DB entry is present.
        2. Checks if a physical folder exists in the system."""
        # ASCII-fold the title, then underscore any remaining non-ASCII chars.
        unicoded_title = "".join((i if ord(i) < 128 else '_') for i in unidecode(self.title))
        parent_folder = self.folder

        # Pick the sibling queryset and the on-disk base path for this level.
        if parent_folder:
            siblings = ImageFolder.objects.filter(folder=parent_folder, title=self.title)
            base_path = parent_folder.path
        else:
            siblings = ImageFolder.objects.filter(folder__isnull=True, title=self.title)
            base_path = IMAGES_FOLDER_NAME

        if siblings.count() > 0:
            raise ValidationError("Folder exists in the DB!", code='db')
        folder_path = os.path.join(settings.MEDIA_ROOT, base_path, unicoded_title)
        if os.path.isdir(folder_path):
            raise ValidationError("Folder exists in the OS!", code='os')
项目:Wagtail-Image-Folders    作者:anteatersa    | 项目源码 | 文件源码
def get_upload_to(self, filename):
        """Build the storage path for an uploaded image.

        Non-ASCII characters are transliterated (or replaced with '_') and
        the filename is truncated so the full path stays within Django's
        100-character FileField limit.
        """
        filename = self.file.field.storage.get_valid_name(filename)

        # do a unidecode in the filename and then
        # replace non-ascii characters in filename with _ , to sidestep issues with filesystem encoding
        filename = "".join((char if ord(char) < 128 else '_') for char in unidecode(filename))

        base_dir = self.folder.path if self.folder else IMAGES_FOLDER_NAME
        full_path = os.path.join(base_dir, filename)

        # Truncate filename so it fits in the 100 character limit
        # https://code.djangoproject.com/ticket/9893
        if len(full_path) >= 95:
            excess = len(full_path) - 94
            stem, extension = os.path.splitext(filename)
            filename = stem[:-excess] + extension
            full_path = os.path.join(base_dir, filename)
        return full_path
项目:openkamer    作者:openkamer    | 项目源码 | 文件源码
def get_members_missing(members_current, members_current_check):
    """Return display strings for expected members absent from members_current.

    Surnames and forenames are ASCII-folded on both sides before comparison
    so accented characters do not cause false mismatches.
    """
    missing = []
    for expected in members_current_check:
        expected_name = unidecode(expected['name'])
        expected_forename = unidecode(expected['forename'])
        matched = any(
            unidecode(member.person.surname_including_prefix()) == expected_name
            and unidecode(member.person.forename) == expected_forename
            for member in members_current
        )
        if not matched:
            missing.append(
                expected['initials'] + ' ' + expected['name'] + ' (' + expected['forename'] + ')')
    return missing
项目:openkamer    作者:openkamer    | 项目源码 | 文件源码
def get_members_incorrect(members_current, members_current_check):
    """Return current members that do not appear in the reference list.

    Comparison uses the ASCII-folded surname (including prefix) and forename
    on both sides.
    """
    incorrect = []
    for member in members_current:
        surname = unidecode(member.person.surname_including_prefix())
        forename = unidecode(member.person.forename)
        matched = any(
            unidecode(check['name']) == surname
            and unidecode(check['forename']) == forename
            for check in members_current_check
        )
        if not matched:
            incorrect.append(member)
    return incorrect
项目:openkamer    作者:openkamer    | 项目源码 | 文件源码
def find_party(name):
        """Look up a PoliticalParty by any of several name variants.

        Tries the full name first, then the short name; each is matched
        case-insensitively against the raw name, its ASCII transliteration,
        a 'Lid-' prefixed form and a dash-less form. Returns the first
        match, or None (after logging a warning) if nothing matches.
        """
        variants = [
            name,
            unidecode(name),         # ASCII-folded spelling
            'Lid-' + name,           # membership-prefixed party names
            name.replace('-', ' '),  # dashes written as spaces
        ]
        for field in ('name__iexact', 'name_short__iexact'):
            matches = PoliticalParty.objects.filter(**{field: variants[0]})
            for variant in variants[1:]:
                matches = matches | PoliticalParty.objects.filter(**{field: variant})
            if matches.exists():
                return matches[0]
        logger.warning('party not found: ' + name)
        return None