The following 34 code examples, extracted from open source Python projects, illustrate how to use html2text.html2text().
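Before the project examples, here is a minimal self-contained sketch of the basic call. The HTML snippet and the option values are illustrative assumptions, not taken from any project below: html2text.html2text() accepts an HTML string and returns Markdown-flavoured plain text, while the HTML2Text class (used in a few of the later examples) exposes the same conversion with per-instance options.

import html2text

# Illustrative input, assumed for this sketch only.
html = "<h1>Title</h1><p>Some <b>bold</b> text and a <a href='https://example.com'>link</a>.</p>"

# Module-level helper: convert an HTML string to Markdown-style text.
# bodywidth=0 disables hard line wrapping, as several examples below also do.
print(html2text.html2text(html, bodywidth=0))

# Class-based API: configure options per instance, then call handle().
h = html2text.HTML2Text()
h.ignore_links = True   # drop hyperlinks from the output
print(h.handle(html))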
def format_text(feed):
    """
    Converts a html text to markdown and adds a bottom line to it
    :param feed: Feed to format
    :return: formatted text
    """
    text = html2text.html2text(feed.desc)
    link = feed.link
    text = '[Link zum PietSmiet.de-Artikel](' + link + ')\n\n' + \
           text + '\n\n--- \n[Code](https://github.com/PietsmietApp/pietsmiet_xposter) | ' + \
           '*Auch als Push-Benachrichtigung in der [Community App für Pietsmiet](' \
           'https://play.google.com/store/apps/details?id=de.pscom.pietsmiet&referrer=utm_source%3Dreddit' \
           '%26utm_medium%3Duploadplan)* '
    return text
def send_email(to_address, subject, html_body):
    try:
        smtp = EmailService.create_smtp_server()
        message = mailer.Message(
            From=EmailService.__from_address,
            To=to_address,
            charset='utf-8')
        message.Subject = subject
        message.Html = html_body
        message.Body = html2text.html2text(html_body)

        if not EmailService.__is_debug_mode:
            print("Sending message (live!)")
            smtp.send(message)
        else:
            print("Skipping send, email is in dev mode.")
    except Exception as x:
        print("Error sending mail: {}".format(x))
def checkCalendarForUpcomingEvents():
    """ Checks calendar for upcoming events """
    nowDate = datetime.datetime.now()
    laterDate = nowDate + datetime.timedelta(minutes=calendarSettings['TimespanToCheck'])
    successful, res = showAgenda('', nowDate.strftime("%d.%m.%Y %H:%M"),
                                 laterDate.strftime("%d.%m.%Y %H:%M"), True)
    if successful:
        for item in res:
            eventContent = '### **{0}**\nTime: {1} - {2} (KIT time)\nDetails: {3}Location: {4}\n\n'.format(
                item.subject,
                item.start.astimezone(EWSTimeZone.timezone('Europe/Copenhagen')).strftime('%H:%M'),
                item.end.astimezone(EWSTimeZone.timezone('Europe/Copenhagen')).strftime('%H:%M'),
                html2text.html2text(item.body),
                item.location)
            for subcalendar in item.categories:
                try:
                    mattermostHook.send(eventContent, channel=subcalendar)
                except Exception as e:
                    messageContent = eventContent + '\n Error occured: \n {0} \n'.format(e.__doc__)
                    mattermostHook.send(messageContent, channel=mattermostSettings['DefaultChannel'])
def dl_scripts():
    url = BASE_URL
    r = requests.get(url)
    tree = BeautifulSoup(r.text, "html.parser")
    os.makedirs("plays", exist_ok=True)
    for a in tree.find_all("a")[2:-7]:
        link = a.get("href").split("/")[0]
        title = a.text.strip().replace(" ", "_")
        title = title.replace("\n", "_")
        fn = "plays/" + title + ".txt"
        r = requests.get(BASE_URL + "/" + link + "/full.html")
        body = html2text(r.text.replace("blockquote", "p"))
        body = body[body.index("### ACT I"):]
        with open(fn, "w") as f:
            f.write(body)
def build_mime_text(recipients, subject, message):
    """
    Puts message data into MIME format
    :param recipients: array of email addresses to send email to
    :param subject: subject of email
    :param message: body of email
    :return MIMEMultipart object
    """
    # Record the MIME types of text/plain and text/html.
    part1 = MIMEText(html2text.html2text(message), 'plain')
    part2 = MIMEText(message, 'html')

    # Attach parts into mime message container.
    body = MIMEMultipart('alternative')
    body['Subject'] = subject
    body['From'] = options.smtp_from
    body['To'] = ",".join(recipients)
    body.attach(part1)
    body.attach(part2)

    raise Return(body)
def staff_reminder(request):  # pylint: disable=invalid-name
    if config.STAFF_EMAIL_REMINDER:
        request_type = type(request).__name__.lower()
        staff_url = "/email/template/{}/staff/reminder/".format(
            request_type
        )
        context = {
            request_type: request,
            "protocol": "https",
            "site": Site.objects.get(id=SITE_ID),
            "FELLOWS_MANAGEMENT_EMAIL": config.FELLOWS_MANAGEMENT_EMAIL,
        }
        flatemail = FlatPage.objects.get(url=staff_url)
        template = Template(flatemail.content)
        jinja_context = Context(context)
        html = template.render(jinja_context)
        plain_text = html2text(html)
        mail_staffs(
            flatemail.title,
            plain_text,
            html_message=html,
            fail_silently=False
        )
def ENMLtoText(contentENML):
    soup = BeautifulSoup(contentENML.decode('utf-8'))

    for section in soup.select('li > p'):
        section.replace_with(section.contents[0])

    for section in soup.select('li > br'):
        if section.next_sibling:
            next_sibling = section.next_sibling.next_sibling
            if next_sibling:
                if next_sibling.find('li'):
                    section.extract()
        else:
            section.extract()

    Editor.checklistInENMLtoSoup(soup)

    for section in soup.findAll('en-todo', checked='true'):
        section.replace_with('[x]')

    for section in soup.findAll('en-todo'):
        section.replace_with('[ ]')

    content = html2text.html2text(str(soup).decode('utf-8'), '', 0)
    content = re.sub(r' *\n', os.linesep, content)
    return content.encode('utf-8')
def markdown_db_migrate():
    '''Perform a migration of the app long descriptions from HTML to
    Markdown for existing database records'''
    with app.app_context():
        query = 'SELECT id, long_description FROM "app";'
        query_result = db.engine.execute(query)
        old_descriptions = query_result.fetchall()
        for old_desc in old_descriptions:
            if old_desc.long_description:
                new_description = html2text(old_desc.long_description)
                query = text('''
                    UPDATE app SET long_description=:long_description
                    WHERE id=:id''')
                db.engine.execute(query,
                                  long_description=new_description,
                                  id=old_desc.id)
def _handle_anime(entry):
    embed = discord.Embed(title=entry.title.string)
    embed.url = BASE_URL_MYANIMELIST.format("anime", entry.id.string)
    embed.add_field(name="ID", value=entry.id.string)
    embed.add_field(name="Synonyms", value=entry.synonyms.string)
    embed.add_field(name="Episodes", value=entry.episodes.string)
    embed.add_field(name="Score", value=entry.score.string)
    embed.add_field(name="Type", value=entry.type.string)
    embed.add_field(name="Status", value=entry.status.string)
    embed.add_field(name="Start date", value=entry.start_date.string)
    embed.add_field(name="End date", value=entry.end_date.string)
    embed.description = html2text.html2text(entry.synopsis.string)
    return embed
def _handle_manga(entry):
    embed = discord.Embed(title=entry.title.string)
    embed.url = BASE_URL_MYANIMELIST.format("manga", entry.id.string)
    embed.add_field(name="ID", value=entry.id.string)
    embed.add_field(name="Synonyms", value=entry.synonyms.string)
    embed.add_field(name="Chapters", value=entry.chapters.string)
    embed.add_field(name="Volumes", value=entry.volumes.string)
    embed.add_field(name="Score", value=entry.score.string)
    embed.add_field(name="Type", value=entry.type.string)
    embed.add_field(name="Status", value=entry.status.string)
    embed.add_field(name="Start date", value=entry.start_date.string)
    embed.add_field(name="End date", value=entry.end_date.string)
    embed.description = html2text.html2text(entry.synopsis.string)
    return embed
def send_html_email(to_addr, **kwargs):
    data_dict = kwargs['data_dict']
    subject_template = kwargs['subject_template']
    email_template = kwargs['email_template']
    email_tag = settings.EMAIL_TAG

    subject = "{} {}".format(email_tag,
                             remove_newlines(render_to_string(subject_template, data_dict)))
    html_body = render_to_string(email_template, data_dict)
    text_body = html2text.html2text(html_body)
    send_mail(subject=subject,
              message=text_body,
              from_email=settings.DEFAULT_FROM_EMAIL,
              recipient_list=to_addr,
              fail_silently=True,
              html_message=html_body)
def load_active_text(soup):
    text_entry = soup.select("text[active=1]")[0]
    content = text_entry.find("content").get_text()
    content = BeautifulSoup(content, "lxml")

    for node in content.select("code a"):
        node.parent.unwrap()

    return text_entry["id"], html2text(str(content))
def add_localization(language, exercise_id, config_path):
    config = Config.load(Path.cwd() / (config_path or "import-config.yml"))
    api = ApiClient(config.api_url, config.api_token)

    exercise = api.get_exercise(exercise_id)
    exercise["localizedTexts"].append({
        "locale": language,
        "text": html2text(sys.stdin.read())
    })

    api.update_exercise(exercise_id, exercise)
def get_website_languages(self, json_data):
    url_language_dictionary = {}
    url_count = 0
    for article in json_data:
        for url in json_data[article]:
            url_count += 1
            # print url_count
            if url in url_language_dictionary:
                continue
            # start a timeout counter
            signal.alarm(10)
            try:
                html = urllib.urlopen(url)
                encoding = html.headers.getparam('charset')
                if encoding is None:
                    encoding = chardet.detect(html.read())['encoding']
                encoded_html = unicode(html.read(), encoding, errors='replace')
                markup_text = html2text.html2text(encoded_html)
                html_from_markup = markdown(markup_text)
                text = ''.join(BeautifulSoup(html_from_markup, "lxml").findAll(text=True))
                language = detect(text)
                url_language_dictionary[url] = language
            except TimeoutException:
                print "timeout for: " + url
            except Exception as exception:
                print "Continue after " + exception.__class__.__name__ + " for URL: " + url
                continue
    return url_language_dictionary
def converthtml2text(html):
    # build the flat text
    html2text.BODY_WIDTH = 0
    html2text.IGNORE_ANCHORS = True
    html2text.IGNORE_IMAGES = True
    outstr = html2text.html2text(html)

    # html2text adds markup: | for bold, ** for italic, # for header, *** for hr - remove
    outstr = outstr.replace("|", "")
    outstr = outstr.replace("**", "")
    outstr = outstr.replace("# ", "")
    outstr = outstr.replace("* * *", "")

    # remove double spaces
    while True:
        filelen = len(outstr)
        outstr = outstr.replace("  ", " ")
        if filelen == len(outstr):
            break
    outstr = outstr.replace("\n ", "\n")
    outstr = outstr.replace(" \n", "\n")

    # remove empty lines
    while True:
        filelen = len(outstr)
        outstr = outstr.replace("\n\n", "\n")
        if filelen == len(outstr):
            break

    return outstr


# sends to OCR a PDF file
# the text file is stored in the folder targetpath
# returns the path of the output txt file
# uses Abby FineReader Hot folder
# if text file already exists (previously OCR), does not OCR again
# can be replaced with other method if necessary
# returns a tuple
#   1st element - operation code (ERROR, CREATED, EXISTS)
#   2nd element - error message or ocr file path
def html2text(s):
    s = re.compile('</*en-media[^>]*?>').sub('', s)
    return h2t(s)
def process(input, entities):
    output = {}
    try:
        book_title = entities['book'][0]['value']
        with requests_cache.enabled('book_cache', backend='sqlite', expire_after=86400):
            response = requests.get(
                'https://www.goodreads.com/book/title.xml?key=' + GOODREADS_ACCESS_TOKEN +
                '&title=' + book_title)
        data = ElementTree.fromstring(response.content)
        book_node = data.find('book')
        author = book_node.find('authors').find('author').find('name').text
        title = book_node.find('title').text
        description = html2text(book_node.find('description').text)
        average_rating = book_node.find('average_rating').text
        link = book_node.find('link').text
        goodreads_attribution = '- Powered by Goodreads'
        template = TextTemplate()
        template.set_text('Title: ' + title + '\nAuthor: ' + author + '\nDescription: ' + description)
        template.set_post_text('\nAverage Rating: ' + average_rating + ' / 5' + '\n' + goodreads_attribution)
        text = template.get_text()
        template = ButtonTemplate(text)
        template.add_web_url('Goodreads Link', link)
        output['input'] = input
        output['output'] = template.get_message()
        output['success'] = True
    except:
        error_message = 'I couldn\'t find any book matching your query.'
        error_message += '\nPlease ask me something else, like:'
        error_message += '\n - book timeline'
        error_message += '\n - harry potter book plot'
        error_message += '\n - little women book rating'
        output['error_msg'] = TextTemplate(error_message).get_message()
        output['success'] = False
    return output
def format_text(html):
    text = html2text.html2text(html, bodywidth=0).replace('\n\n', '\n')
    text = HTMLParser.HTMLParser().unescape(text)
    text = strip_markdown.strip(text)
    text = re.sub(' +\n', '\n', text).strip()
    text = re.sub('\n\n\n', '\n\n', text)
    return text
def sentenceTokenize():
    sentences = html2text.html2text(request.form['sentences'])
    result = nlp.sentenceTokenize(sentences)
    return buildResponse.sentPlainText(result)
def posTagAndLabel():
    sentences = request.form['sentences']
    cleanSentences = html2text.html2text(sentences)
    result = nlp.posTagAndLabel(cleanSentences)
    return buildResponse.buildJson(result)
def send(self, force=False):
    if not self.enabled and not force:
        return

    subject = six.text_type(DjangoTemplate(self.subject).render(Context(self.variables)))
    html = self.compile()
    text = html2text.html2text(html)

    self._send(subject, text, settings.HAPPYMAILER_FROM,
               recipient_list=self.recipients(),
               html_message=html,
               fail_silently=False)
def main():
    speech.stop()
    if not appex.is_running_extension():
        console.hud_alert('Reading clipboard')
        text = clipboard.get()
        url = None
    else:
        text = appex.get_text()
        url = appex.get_url()
    if url == None:
        try:
            url = [mgroups[0] for mgroups in GRUBER_URLINTEXT_PAT.findall(text)][0]
        except:
            pass
    if url != None:
        console.hud_alert('Reading: ' + url)
        h = html2text.HTML2Text()
        try:
            r = requests.get(
                url=url,
                headers={"User-agent": "Mozilla/5.0{0:06}".format(random.randrange(999999))})
        except requests.ConnectionError as e:
            console.alert('Unable to connect to url.')
            return True
        html_content = r.text.decode('utf-8')
        text = html2text.html2text(html_content)
    else:
        console.hud_alert('Reading text: ' + str(text))
    if text:
        speech.say(text)
        stop = console.alert('Done?', hide_cancel_button=True, button1='OK')
        speech.stop()
    else:
        console.hud_alert('No text found.')
def main():
    if appex.is_running_extension():
        url = appex.get_url()
        if url == None:
            text = appex.get_text()
            url = [mgroups[0] for mgroups in GRUBER_URLINTEXT_PAT.findall(text)][0]
    else:
        text = clipboard.get().strip()
        url = [mgroups[0] for mgroups in GRUBER_URLINTEXT_PAT.findall(text)][0]
    if not "http" in url:
        url = "http://"
        try:
            url = console.input_alert("URL", "", url)
        except:
            return True
    console.hud_alert('URL: %s' % url)
    h = html2text.HTML2Text()
    try:
        r = requests.get(
            url=url,
            headers={"User-agent": "Mozilla/5.0{0:06}".format(random.randrange(999999))}
        )
    except Exception as e:
        raise(e.message)
        return True
    html_content = r.text.decode('utf-8')
    rendered_content = html2text.html2text(html_content)
    clipboard.set(rendered_content)
    launch_e = console.alert('Markdown copied to clipboard. Launch Evernote?',
                             button1='Yes', button2='No', hide_cancel_button=True)
    if launch_e == 1:
        _eurl = "evernote://x-callback-url/new-note?type=clipboard&title=DRAFT&text="
        app = UIApplication.sharedApplication()
        eurl = nsurl(_eurl)
        app.openURL_(eurl)
    appex.finish()
def main():
    if appex.is_running_extension():
        url = appex.get_url()
    else:
        url = clipboard.get().strip()
    if not RE_URL.match(url):
        try:
            url = console.input_alert("Enter gamefaqs URL", "", "https://www.gamefaqs.com/")
        except KeyboardInterrupt:
            sys.exit(0)
    newurl = "{0}?print=1".format(url)
    # baseurl = http://www.gamefaqs.com/ps3/959558-fallout-new-vegas/faqs/61226
    if RE_URL.match(url):
        h = html2text.HTML2Text()
        r = requests.get(
            url=newurl,
            headers={"User-agent": "Mozilla/5.0{0:06}".format(random.randrange(999999))}
        )
        html_content = r.text.decode('utf-8')
        rendered_content = html2text.html2text(html_content)
        filename = url.partition("gamefaqs.com/")[-1].partition("/")[-1].partition("/faqs")[0] + ".txt"
        filepath = os.path.join(os.path.expanduser("~/Documents"), filename)
        with open(filepath, "w") as fo:
            fo.write(rendered_content)
        console.hud_alert('Success! Saved {0}'.format(filename), "success")
def get_site_text(url):
    resp = requests.get(url)
    resp.raise_for_status()
    html = resp.text
    return html2text.html2text(html)


# 2: Score each word for an individual page against the full set of pages
def get_site_text(url):
    resp = requests.get(url)
    resp.raise_for_status()
    html = resp.text
    return html2text.html2text(html)


# 2: Import stopwords from an external file
def get_plain_text(self):
    action = html2text(smart_str(self.action)).rstrip()
    effect = html2text(smart_str(self.effect)).rstrip()
    setup = html2text(smart_str(self.setup)).rstrip()
    breakdown = html2text(smart_str(self.breakdown)).rstrip()
    return PlainText(action=action, setup=setup, effect=effect, breakdown=breakdown)
def process(input, entities):
    output = {}
    try:
        book_title = entities['book'][0]['value']
        with requests_cache.enabled('book_cache', backend='sqlite', expire_after=86400):
            response = requests.get(
                'https://www.goodreads.com/book/title.xml?key=' + GOODREADS_ACCESS_TOKEN +
                '&title=' + book_title)
        data = ElementTree.fromstring(response.content)
        book_node = data.find('book')
        author = book_node.find('authors').find('author').find('name').text
        title = book_node.find('title').text
        description = html2text(book_node.find('description').text)
        average_rating = book_node.find('average_rating').text
        link = book_node.find('link').text
        goodreads_attribution = '- Powered by Goodreads'
        template = TextTemplate()
        template.set_text('Title: ' + title + '\nAuthor: ' + author + '\nDescription: ' + description)
        template.set_post_text('\nAverage Rating: ' + average_rating + ' / 5' + '\n' + goodreads_attribution)
        text = template.get_text()
        template = ButtonTemplate(text)
        template.add_web_url('Goodreads Link', link)
        output['input'] = input
        output['output'] = template.get_message()
        output['success'] = True
    except:
        error_message = 'I couldn\'t find any book matching your query.'
        error_message += '\nPlease ask me something else, like:'
        error_message += '\n - book timeline'
        error_message += '\n - harry potter book plot'
        error_message += '\n - little women book rating'
        output['error_msg'] = TextTemplate(error_message).get_message()
        output['success'] = False
    return output
def get_bot_define_response(self, original_content: str) -> str:
    split_content = original_content.split(' ')

    # If there is more than one word (a phrase)
    if len(split_content) > 1:
        return DefineHandler.PHRASE_ERROR_MESSAGE

    to_define = split_content[0].strip()
    to_define_lower = to_define.lower()

    # Check for presence of non-letters
    non_letters = set(to_define_lower) - set(string.ascii_lowercase)
    if len(non_letters):
        return self.SYMBOLS_PRESENT_ERROR_MESSAGE

    # No word was entered.
    if not to_define_lower:
        return self.EMPTY_WORD_REQUEST_ERROR_MESSAGE
    else:
        response = '**{}**:\n'.format(to_define)
        try:
            # Use OwlBot API to fetch definition.
            api_result = requests.get(self.DEFINITION_API_URL.format(to_define_lower))
            # Convert API result from string to JSON format.
            definitions = api_result.json()

            # Could not fetch definitions for the given word.
            if not definitions:
                response += self.REQUEST_ERROR_MESSAGE
            else:
                # Definitions available. Show definitions line by line.
                for d in definitions:
                    example = d['example'] if d['example'] else '*No example available.*'
                    response += '\n' + '* (**{}**) {}\n {}'.format(
                        d['type'], d['defenition'], html2text.html2text(example))
        except Exception as e:
            response += self.REQUEST_ERROR_MESSAGE
            logging.exception("")

        return response
def review_notification(email_url, user_email, context, mail,
                        copy_to_staffs=False, copy_to_gatekeeper=False):  # pylint: disable=too-many-arguments
    """Compose the message and send the email."""
    if config.CLAIMANT_EMAIL_NOTIFICATION and email_url is not None:
        # Generate message
        flatemail = FlatPage.objects.get(url=email_url)
        template = Template(flatemail.content)
        context.update({
            "notes": mail.justification,
            "protocol": "https",
            "site": Site.objects.get(id=SITE_ID),
            "FELLOWS_MANAGEMENT_EMAIL": config.FELLOWS_MANAGEMENT_EMAIL,
        })
        context = Context(context)
        html = template.render(context)
        plain_text = html2text(html)
        mail.justification = plain_text

        # Email to claimant
        msg = EmailMultiAlternatives(
            flatemail.title,
            plain_text,
            mail.sender.email,
            user_email,
            cc=[config.WEBSITE_GATEKEEPER_EMAIL] if copy_to_gatekeeper else None,
            bcc=ast.literal_eval(config.STAFFS_EMAIL) if copy_to_staffs else None,
            reply_to=[config.FELLOWS_MANAGEMENT_EMAIL]
        )
        msg.attach_alternative(html, "text/html")
        msg.send(fail_silently=False)

    # Every email is archived in the database
    mail.save()
async def update_check(self):
    await self.bot.wait_until_ready()
    self.bot.logger.debug("Started GAF Steam Announcement RSS Update Check Loop")
    while not self.bot.is_closed():
        response, _, code = await net.get_url("http://steamcommunity.com/groups/TheNeverEndingGAF/rss/")
        xml = await response.read()
        root = etree.fromstring(xml)
        last_pub = dateparser.parse(self.bot.config["pub_dates"]["gaf"])
        new_posts = []
        for element in root.xpath("//item"):
            post_pub = dateparser.parse(element[3].text)
            if post_pub > last_pub:
                new_posts.append(element)

        # Iterate over new posts
        for i, p in reversed(list(enumerate(new_posts))):
            # Update date if it's the newest post. Should be last element iterated through
            if i == 0:
                self.bot.config["pub_dates"]["gaf"] = p[3].text
                await self.bot.update_config()
                self.bot.logger.debug("Updated GAF pub date")

            # Post to guilds
            for guild in self.bot.guilds:
                guild_config = await self.bot.get_guild_config(guild.id)
                if guild_config["feeds"]["gaf"]["enabled"]:
                    channel = discord.utils.get(guild.channels, id=guild_config["feeds"]["gaf"]["channel"])
                    with channel.typing():
                        if len(html2text.html2text(p.find("description").text)) > 1900:
                            content = html2text.html2text(p.find("description").text[:1900]) + ". . ."
                        else:
                            content = html2text.html2text(p.find("description").text)
                        embed = discord.Embed(
                            title="{}".format(p.find("title").text),
                            colour=discord.Colour.gold(),
                            url="{}".format(p.find("link").text),
                            timestamp=dateparser.parse(p[3].text),
                            description=content
                        )
                        embed.set_thumbnail(url="http://www.neverendinggaf.com/graphics/logos/gaf-logo.jpg")
                        embed.set_footer(text="Author - {}".format(p.find("author").text))
                        if "@everyone" in content:
                            message_content = "**New Announcement** - Content Below @everyone"
                        else:
                            message_content = "**New Announcement** - Content Below"
                        message_content += "\n*Author* : {}".format(p.find("author").text)
                        await channel.send(content=message_content, embed=embed)
                        self.bot.logger.debug(f"Sent new GAF Steam Announcement to guild {guild} channel {channel}")
        await asyncio.sleep(60)
def new_notification(staff_url, email_url, user_email, context, mail):
    if config.STAFF_EMAIL_NOTIFICATION:
        # Email to staff
        context.update({
            "protocol": "https",
            "site": Site.objects.get(id=SITE_ID),
            "FELLOWS_MANAGEMENT_EMAIL": config.FELLOWS_MANAGEMENT_EMAIL,
        })
        flatemail = FlatPage.objects.get(url=staff_url)
        template = Template(flatemail.content)
        jinja_context = Context(context)
        html = template.render(jinja_context)
        plain_text = html2text(html)
        mail_staffs(
            flatemail.title,
            plain_text,
            html_message=html,
            fail_silently=False
        )

    if config.CLAIMANT_EMAIL_NOTIFICATION:
        # Email to claimant
        context.update({
            "protocol": "https",
            "site": Site.objects.get(id=SITE_ID),
            "FELLOWS_MANAGEMENT_EMAIL": config.FELLOWS_MANAGEMENT_EMAIL,
        })
        flatemail = FlatPage.objects.get(url=email_url)
        template = Template(flatemail.content)
        jinja_context = Context(context)
        html = template.render(jinja_context)
        plain_text = html2text(html)
        msg = EmailMultiAlternatives(
            flatemail.title,
            plain_text,
            DEFAULT_FROM_EMAIL,
            user_email,
            reply_to=[config.FELLOWS_MANAGEMENT_EMAIL]
        )
        msg.attach_alternative(html, "text/html")
        msg.send(fail_silently=False)

        mail.justification = plain_text
        mail.save()