def process_post_bodies(bodies: List[Tag]) -> (str, list):
    """Yield the flattened text and the cited names for each post body.

    One ``(text, cites)`` pair is produced per element of ``bodies``:

    * ``cites`` holds the ``name`` attribute of every ``div`` with class
      ``cite`` found inside the body (an empty list when there is none).
    * ``text`` is assembled from the body's direct children only: a ``div``
      or ``p`` child contributes a single newline, any other child
      contributes its ``.text`` when it has that attribute, or its string
      form when it is a ``NavigableString``.

    NOTE(review): the return annotation is a tuple expression although the
    function is a generator of such pairs; it is kept as-is so the
    signature stays untouched.
    """
    for body in bodies:
        cite_divs = body.findAll('div', {'class': 'cite'})
        cites = [div['name'] for div in cite_divs]

        pieces = []
        for child in body:
            # TODO: This is a suboptimal (and partially wrong) solution to
            # parse cites in post body (a lot to improve here)
            if child.name in ('div', 'p'):
                pieces.append('\n')
            elif hasattr(child, 'text'):
                pieces.append(child.text)
            elif isinstance(child, NavigableString):
                pieces.append(str(child))

        yield ''.join(pieces), cites
def checklistInENMLtoSoup(soup):
    '''
    Transforms Evernote checklist elements to github `* [ ]` task list style.

    Every ``en-todo`` element in ``soup`` is replaced in place by the text
    ``"[x] "`` (when its ``checked`` attribute equals ``"true"``) or
    ``"[ ] "``.  When the ``en-todo`` was the first tag found inside a ``p``
    or ``div`` parent, that parent is additionally replaced by a new ``li``
    whose string is the parent's stripped direct text.  The tree is modified
    in place; nothing is returned.
    '''
    block_tags = ['p', 'div']

    # soup.select cant be used with dashes:
    # https://bugs.launchpad.net/beautifulsoup/+bug/1276211
    for todo in soup.find_all('en-todo'):
        holder = todo.parent
        # Must be decided before the todo is replaced below, because the
        # test looks the todo up inside its parent.
        leads_block = holder.find() == todo and holder.name in block_tags

        if todo.attrs.get('checked', None) == "true":
            marker = "[x] "
        else:
            marker = "[ ] "
        todo.replace_with(marker)

        if not leads_block:
            continue

        # EN checklists can appear anywhere, but when one opens a block
        # element the block is rewritten so it resembles github markdown.
        text_parts = [unicode(child) for child in holder.children
                      if isinstance(child, NavigableString)]
        item = soup.new_tag("li")
        item.string = ''.join(text_parts).strip()
        holder.replace_with(item)
def process_tag(tag, valid_tags=()):
    """Recursively strip tags whose names are not in ``valid_tags``.

    * A ``NavigableString`` is returned unchanged.
    * A tag whose name is in ``valid_tags`` has each child replaced in place
      by its processed form, and the tag itself is returned.
    * Any other tag is dropped: the ``str()`` of each processed child is
      concatenated and that plain string is returned.
    """
    if isinstance(tag, NavigableString):
        return tag

    if tag.name not in valid_tags:
        return "".join(
            str(process_tag(child, valid_tags)) for child in tag.contents
        )

    for child in tag.contents:
        child.replaceWith(process_tag(child, valid_tags))
    return tag
def get_students(self):
    """Return the group's header data together with its list of students.

    ``self.group`` is read as ``"<name> (<code>)"``: the text before the
    first ``(`` (stripped) is the group name and the text between that
    ``(`` and the following ``)`` is the group code.  When there is no
    parenthesis the whole stripped string is the name and the code is
    ``''``.

    Each non-text child of ``self.table`` is treated as one student row.

    Returns:
        dict with keys ``'group'``, ``'code'``, ``'students'`` and ``'id'``
        (``self.group_id``).  ``'students'`` is a list of dicts with keys
        ``'name'``, ``'id'``, ``'record_book'`` and ``'active'`` (0 or 1).
    """
    # partition() never yields a -1 index, so a missing parenthesis can no
    # longer chop the last character off the name.
    group_name, _, remainder = self.group.partition('(')
    group_name = group_name.strip()
    group_code = remainder.partition(')')[0]

    students = []
    for row in self.table.children:
        # Text nodes between rows (and comment nodes, which are
        # NavigableString subclasses) carry no student data.
        if isinstance(row, NavigableString):
            continue

        name_cell = row.find(class_='fio_3')
        link = name_cell.parent
        # A gray inline style on the link marks the student as inactive.
        active = not (link.has_attr('style')
                      and link['style'] == 'color:gray;')
        student_id = parse_qs(urlparse(link['href']).query)['sid'][0]

        # split()/join collapses inner whitespace runs and trims the ends.
        name = " ".join(name_cell.string.split())
        record_book_id = " ".join(row.find(class_='hc3').string.split())

        students.append({'name': name,
                         'id': student_id,
                         'record_book': record_book_id,
                         'active': int(active)})

    return {'group': group_name,
            'code': group_code,
            'students': students,
            'id': self.group_id}
def parse(movie):
    """Download the page for ``movie`` and fill in score, director and actor.

    The page at ``PAGE_URL % movie.id`` is fetched and parsed.
    ``movie.score`` is set from the ``strong`` element with class
    ``rating_num``.  The children of the ``div`` with id ``info`` are then
    scanned for labelled rows that set ``movie.director`` / ``movie.actor``.
    Finally the movie is printed.
    """
    response = requests.get(PAGE_URL % movie.id)
    soup = BeautifulSoup(response.text.encode('utf-8'), 'lxml')
    movie.score = soup.find('strong', 'rating_num').text

    info = soup.find('div', {'id': 'info'})
    # Remove the <br> separators before walking the info block's children.
    for br in info.find_all('br'):
        br.extract()

    for row in info.contents:
        if isinstance(row, NavigableString):
            continue
        parts = row.contents
        label = parts[0]
        if not label:
            continue
        # NOTE(review): the two label literals below are identical in this
        # file, so the second branch can never run.  They look like
        # non-ASCII labels lost to an encoding problem - restore them from
        # the upstream source.
        if label.string == u'??':
            if isinstance(parts[1], NavigableString):
                movie.director = parts[2].text
        elif label.string == u'??':
            if isinstance(parts[1], NavigableString):
                movie.actor = parts[2].text
    print(movie)
def parse_character_results(soup):
    """
    Parse a page of character results.

    The first ``table`` with class ``stripe`` is used; its first child is
    skipped and every remaining child is treated as one character row.

    :param soup: The BS4 class object
    :return: Returns a list of dictionaries containing a name, gender and
             list of dictionaries containing a game name/id pair for games
             they appeared in.
    """
    table = soup.find_all('table', class_='stripe')[0]
    rows = list(table.children)[1:]

    characters = []
    for row in rows:
        gender = row.abbr.get('title')
        # The second child of the row holds both the name link and, as its
        # own second child, the container of game links.
        detail = list(row.children)[1]
        name = detail.a.string
        games_holder = list(detail.children)[1]
        games = [
            {'name': game.string, 'id': game.get('href').split('/')[1]}
            for game in list(games_holder.children)
            if not isinstance(game, NavigableString)
        ]
        characters.append({'gender': gender, 'name': name, 'games': games})
    return characters
def print_content(contents): for content in contents: name = content.name #if not isinstance(content, Tag): if isinstance(content, NavigableString): s = str(content) s = s.replace("\n","") print s.strip() else: if name == "img": ''' img = content.find("img") if img: print img.get("src") ''' print "[??]" elif name == "br": print "" elif name == "noscript": continue elif name == "li": print "•", print_content(content.contents)
def get_detail(self, host_soup, vul_summary):
    ''' host report -> section 2.2: vulnerability detail, return dict

    Walks the rows of the table inside the ``div`` with id ``vul_detail``.
    A row containing a ``span`` names a vulnerability: every key of
    ``vul_summary`` that contains that name is remembered.  The next row
    without a ``span`` supplies the solution data (via
    ``self.get_solution``), which is appended to each remembered entry whose
    list currently has exactly five items.  ``vul_summary`` is modified in
    place and also returned.
    '''
    rows = host_soup.find('div', id='vul_detail').table.contents
    pending = []
    for row in rows:
        if type(row) is NavigableString:
            continue

        if row.span:
            title = row.span.string
            pending.extend(key for key in vul_summary if title in key)
            continue

        if not pending:
            continue

        # in case of repeat vulnerability but differ port
        solution = self.get_solution(row)
        for key in pending:
            current = vul_summary.get(key)
            if current and len(current) == 5:
                vul_summary[key].extend(solution)
        pending = []
    return vul_summary
def get_solution(self, tag):
    '''Collect the values of selected rows from ``tag``'s nested table.

    Rows are visited in document order.  For a row whose header matches one
    of the two labels in the tuple below, the stripped text fragments of its
    cell are joined with newlines (a newline directly before ``*`` is
    removed).  For the CVE row the cell's string is taken as-is.  When
    exactly two values were collected, ``None`` is appended so the list has
    three entries.

    NOTE(review): the ``?`` runs in the labels look like non-ASCII text lost
    to an encoding problem - confirm against the report format.
    '''
    value = []
    for row in tag.table.contents:
        if type(row) is NavigableString:
            continue
        header = row.th.string
        if header in (u'????', u'????'):
            lines = [piece.strip() for piece in row.td.strings]
            value.append('\n'.join(lines).replace('\n*', '*'))
        elif header == u'CVE??':
            value.append(row.td.string)

    if len(value) == 2:
        value.append(None)
    return value
def __get_navigable_strings(self,soup):
    """Yield the non-blank text nodes found at or below ``soup``.

    A ``NavigableString`` is yielded when its exact type is neither
    ``Comment`` nor ``Declaration`` and it is not whitespace-only.  Tags are
    descended into child by child, except ``script`` and ``style`` tags,
    whose contents are skipped entirely.
    """
    if isinstance(soup, NavigableString):
        is_excluded = type(soup) in (Comment, Declaration)
        if not is_excluded and soup.strip():
            yield soup
        return

    if soup.name in ('script', 'style'):
        return

    for child in soup.contents:
        for text in self.__get_navigable_strings(child):
            yield text
def parse_aiml_text(text):
    """Return the top-level text of an AIML fragment, dropping nested tags.

    ``text`` is wrapped in a ``<p>`` element and parsed; the stripped,
    non-empty text nodes that are direct children of that element are
    joined with single spaces.  Text inside nested tags is not included.

    If walking the parsed tree fails, the error is logged and the caller's
    original ``text`` is returned unchanged (not the ``<p>``-wrapped markup
    built for parsing).
    """
    # Keep the caller's text intact so the fallback can return it verbatim.
    soup = BeautifulSoup('<p>' + text + '</p>', 'lxml')
    tokens = []
    try:
        for child in soup.p.children:
            if isinstance(child, NavigableString):
                token = child.string.strip()
                if token:
                    tokens.append(token)
    except Exception as ex:
        # Deliberate best effort: fall back to the raw input on any failure.
        logger.warning(ex)
        return text
    return ' '.join(tokens)
def get_first_text(soup, strip = False, types = (NavigableString, CData)):
    """Return the first string produced by ``soup._all_strings``.

    ``strip`` and ``types`` are passed straight through to
    ``soup._all_strings``.  Returns ``None`` when it produces nothing.
    """
    return next(iter(soup._all_strings(strip, types = types)), None)
def get_texts(soup, strip = False, types = (NavigableString, CData)):
    """Return every string produced by ``soup._all_strings`` as a list.

    ``strip`` and ``types`` are passed straight through to
    ``soup._all_strings``.
    """
    return list(soup._all_strings(strip, types = types))
def html_to_text(html):
    """Creates a formatted text email message from a rendered html template (page).

    Only the ``<body>`` is considered; ``""`` is returned when there is
    none.  Each plain text node contributes one output line with its
    whitespace collapsed.  Text whose parent is ``script``, ``style`` or
    ``a`` is skipped, and text directly inside a ``p`` is preceded by an
    extra blank line.  The lines are joined with newlines.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Ignore anything in head
    body = soup.body
    if body is None:
        return ""

    lines = []
    for node in body.descendants:
        # Exact type check rather than isinstance: comments, cdata, etc are
        # subclasses of NavigableString that must not be emitted.
        if type(node) != NavigableString:
            continue

        container = node.parent.name
        # Relies on the assumption that other tags can't be inside a script
        # or style.  Link text is dropped as well (the link itself is not
        # emitted in its place).
        if container in ('script', 'style') or container == 'a':
            continue

        # remove any multiple and leading/trailing whitespace
        collapsed = ' '.join(node.string.split())
        if not collapsed:
            continue
        if container == 'p':
            # Add extra paragraph formatting newline
            collapsed = '\n' + collapsed
        lines.append(collapsed)

    return '\n'.join(lines)
def parse_notes(self):
    """Collect author/comment notes from the page's ``div-comment`` elements.

    A ``div-comment`` element is considered only when it has a following
    sibling (skipping one text node in between) that does not carry the
    ``div-control`` class.  Its text must start with one of the two known
    prefixes; the remainder becomes the value of an ``authors`` or
    ``comment`` entry.

    Returns:
        list of ``{'name': ..., 'value': ...}`` dicts.

    Raises:
        NotImplementedError: when a considered note starts with neither
            known prefix.

    NOTE(review): the ``?`` runs in the literals look like non-ASCII text
    lost to an encoding problem; the slice offsets 10 and 12 are each one
    more than the length of the prefix literal beside them - confirm
    against the original strings.
    """
    notes = []
    for tag in self.page.find_all(class_='div-comment'):
        follower = tag.next_sibling
        # Step over a text node sitting between the comment and what follows.
        if type(follower) is NavigableString:
            follower = follower.next_sibling

        if not follower:
            continue
        if follower.has_attr('class') and 'div-control' in follower['class']:
            continue

        note = tag.get_text()
        if note.startswith('?????(?):'):
            entry = {'name':'authors', 'value':note[10:].strip()}
        elif note.startswith('??????????:'):
            entry = {'name':'comment', 'value':note[12:].strip()}
        else:
            raise NotImplementedError('??????????? ????? ? ??????????: {}'.format(note))
        notes.append(entry)
    return notes
def parse_description(self, tag_id):
    """Read name/value pairs from the element with id ``tag_id``.

    Every ``div-comment`` element inside that element supplies a property
    name (its stripped string).  The value is the stripped string of the
    element's following sibling, skipping one text node in between.
    Properties whose value is missing or blank are left out.

    Returns:
        list of ``{'name': ..., 'value': ...}`` dicts in document order.
    """
    container = self.page.find(id=tag_id)
    description = []
    for label in container.find_all(class_='div-comment'):
        property_name = label.string.strip()

        value_node = label.next_sibling
        # Step over a text node sitting between the label and its value.
        if type(value_node) is NavigableString:
            value_node = value_node.next_sibling

        raw_value = value_node.string
        property_value = raw_value.strip() if raw_value else ''
        if property_value:
            description.append({'name':property_name, 'value':property_value})
    return description
def strip_tags(html, invalid_tags):
    """Flatten ``html`` to text, unwrapping tags and resolving pronoun corefs.

    Steps, in order:

    1. Every tag whose name is in ``invalid_tags`` is replaced by the text
       of its children.  Non-text children are first run through this same
       function, so anything nested inside an invalid tag comes back as
       fully flattened text.
    2. ``coref`` tags are visited in document order.  The text of the first
       tag seen for each ``set-id`` is remembered; a later tag with the same
       ``set-id`` is replaced by that remembered text when its own lowercased
       tokens include at least one entry of the module-level ``pronouns``.
    3. The tree is serialised, runs of tabs/newlines become a single space,
       ``</s><s>`` boundaries become newlines, and all remaining tags are
       removed.

    Returns:
        The resulting plain string.
    """
    soup = BeautifulSoup(html,"html.parser")

    for tag in soup.findAll(True):
        if tag.name in invalid_tags:
            pieces = []
            for child in tag.contents:
                if not isinstance(child, NavigableString):
                    child = strip_tags(unicode(child), invalid_tags)
                pieces.append(unicode(child))
            tag.replaceWith("".join(pieces))

    # set-id -> text of the first mention seen for that set.
    first_mention = {}
    for coref in soup.find_all("coref"):
        set_id = coref['set-id']
        if set_id in first_mention:
            words = nltk.word_tokenize(coref.get_text().lower())
            if pronouns.intersection(words):
                coref.replaceWith(first_mention[set_id])
        else:
            first_mention[set_id] = coref.get_text()

    text = re.sub("(\\t|\\r?\\n)+", " ", str(soup))
    text = re.sub("</s><s>", "\n", text)
    text = re.sub('<[^>]*>', '', text)
    return text
def get_parrafos(soup):
    """Return the elements of ``soup`` that count as paragraphs.

    The result starts with every ``li`` and ``table`` element, followed by
    each ``p`` element that either contains no ``span`` or has at least one
    direct text child for which ``is_vacio`` is false (presumably: the text
    is not blank - verify against ``is_vacio``).
    """
    parrafos = soup.find_all(['li','table'])
    for p in soup.find_all('p'):
        if not p.span or any(
            (isinstance(c, bs4.NavigableString) or isinstance(c, unicode))
            and not is_vacio(c)
            for c in p.contents
        ):
            parrafos.append(p)
    return parrafos
def eqsibling(n):
    """Collect the run of siblings after ``n`` that match it.

    Walks ``n``'s following siblings and stops at the first one that is
    falsy, is a text node for which ``is_vacio`` is false, or is a tag whose
    name differs from ``n``'s or for which ``eqclass(sibling, n)`` is false.
    Everything visited before that point is returned, including the text
    nodes that passed ``is_vacio``.
    """
    matched = []
    wanted = n.name
    node = n.next_sibling
    while node:
        is_text = (isinstance(node, bs4.NavigableString)
                   or isinstance(node, unicode))
        if is_text and not is_vacio(node):
            break
        if not is_text and (node.name != wanted or not eqclass(node, n)):
            break
        matched.append(node)
        node = node.next_sibling
    return matched
async def search_esv(message, verse):
    """
    Search for a bible passage from the English Standard Version.

    Example::

        bible Romans 12:16

    """
    # Declared ``async`` because the body awaits the HTTP request; ``await``
    # is not valid inside a plain ``def``.
    # NOTE(review): the docstring above is kept verbatim in case it is
    # surfaced to users as command help - confirm before editing it.
    r = await http.get("http://www.esvapi.org/v2/rest/passageQuery", params={
        "key": "IP",
        "passage": verse,
        "output-format": "crossway-xml-1.0",
        "include-simple-entities": "true",
    })
    doc = BeautifulSoup(r.text(), features="lxml")
    if not doc.passage:
        raise CommandError("Verse not found.")

    lines = []
    for verse_unit in doc.passage.content.find_all('verse-unit'):
        num = int(verse_unit.find('verse-num').text)
        woc = verse_unit.find('woc')
        if woc:
            # When a <woc> element is present only its text is used.
            text = woc.text
        else:
            # Otherwise take the verse's direct text nodes, skipping
            # comments and anything inside child tags.
            text = "".join([str(node) for node in verse_unit.children
                            if isinstance(node, NavigableString)
                            and not isinstance(node, Comment)])
        lines.append("**{}** {}".format(num, text.strip()))
    return "\n".join(lines)
def UNHDR_scrape_description():
    """Scrape indicator descriptions from the UNDP HDR composite-index pages.

    Each URL in ``urls`` is fetched and parsed, and every ``<p>`` element
    whose text contains ``'Definitions'`` is kept.  For each kept paragraph
    whose text contains ``':'``, the text fragments are examined in order:
    a fragment containing ``': '`` is taken as a description, the fragment
    before it as the indicator name (cut at its first ``':'``), and the
    fragment after it as a details link when it contains ``'http'``.

    Written for Python 2 (``print`` statement).

    Returns:
        dict mapping the abridged indicator name to
        ``[description, details_link]``.

    NOTE(review): indentation was lost in this copy of the file; the second
    top-level loop is placed after the URL loop here - confirm against the
    original layout.
    """
    # the final object will be a dictionary with indicator name as the key and description as content
    may_contain_indicators=[]
    clean_listed_indicators={}
    urls = ['http://hdr.undp.org/en/composite/HDI',
            'http://hdr.undp.org/en/composite/IHDI',
            'http://hdr.undp.org/en/composite/trends',
            'http://hdr.undp.org/en/composite/GDI',
            'http://hdr.undp.org/en/composite/GII',
            'http://hdr.undp.org/en/composite/MPI',]
    for url in urls:
        url_response_raw = rq.get(url)
        BS = BeautifulSoup(url_response_raw.text, "lxml")
        p_elements = BS.find_all('p')
        p_contents = []
        for e in p_elements:
            p_contents.append(e)
        # Keep only the paragraphs that mention 'Definitions'.
        for paragraph in p_contents:
            if not isinstance(paragraph,NavigableString):
                if 'Definitions' in paragraph.text:
                    may_contain_indicators.append(paragraph)
    for paragraf in may_contain_indicators:
        if ':' in paragraf.text:
            # Join the paragraph's text fragments with '::' and split them
            # back apart so each fragment gets an index.
            with_colons_added = paragraf.get_text('::')
            dub_colon_as_list = []
            for i in enumerate(with_colons_added.split('::')):
                dub_colon_as_list.append(i)
            for i,string in dub_colon_as_list:
                if ': ' in string:
                    # NOTE(review): when i == 0 the index i-1 wraps around to
                    # the last fragment - confirm this cannot happen.
                    indicator_name_full=str(unicodedata.normalize('NFKD',dub_colon_as_list[i-1][1]).encode('ascii', 'ignore')).strip('\n')
                    # NOTE(review): find() returns -1 when the name has no
                    # ':', which drops the name's last character.
                    indicator_name_abridged=indicator_name_full[:indicator_name_full.find(':')]
                    description=str(unicodedata.normalize('NFKD',dub_colon_as_list[i][1]).encode('ascii', 'ignore')).strip('\n')
                    # The fragment after the description is used as the link
                    # only when it contains 'http'.
                    if i+1<len(dub_colon_as_list) and 'http' in dub_colon_as_list[i+1][1]:
                        details_link=dub_colon_as_list[i+1][1]
                    else:
                        details_link ='no further link provided for this indicator'
                    print 'adding %s %s %s' % (indicator_name_abridged, description, details_link)
                    clean_listed_indicators[indicator_name_abridged]=[description,details_link]
    return clean_listed_indicators
def get_summary(self, host_soup):
    ''' host report -> section 1: host summary, return list

    Looks at the siblings of the first ``tr`` with class ``even`` and
    collects the cell string of every row whose header is one of the two
    labels below, in document order.  When fewer than two values were
    found, a single ``None`` is appended.

    NOTE(review): the ``?`` runs in the labels look like non-ASCII text lost
    to an encoding problem - confirm against the report format.
    '''
    wanted = (u'IP??', u'????')
    table = host_soup.find('tr', class_='even').parent
    result = [
        row.td.string
        for row in table.contents
        if type(row) is not NavigableString and row.th.string in wanted
    ]
    if len(result) < 2:
        result.append(None)
    return result
def checklistInSoupToENML(soup):
    '''
    Transforms github style checklists `* [ ]` in the BeautifulSoup tree to
    enml.

    Two passes are made over ``soup``, which is modified in place:

    1. A ``ul`` whose every ``li`` begins with a ``[.]`` marker is replaced
       by one ``div`` per item, each holding an ``en-todo`` element
       (``checked="true"`` for ``[x]``) followed by the item's remaining
       text.  The items keep their original order.
    2. In every remaining text node that contains a ``[.]`` marker, the
       last such marker is replaced by an ``en-todo`` element, with the text
       before and after it kept around the new element.
    '''
    checktodo_re = re.compile(r'\[(.)\]')
    # DOTALL so text nodes containing newlines around the marker still match.
    inline_re = re.compile(r'(.*)\[(.)\](.*)', re.DOTALL)

    # To be more github compatible, if in a list all elements begins with
    # `[ ]` transform it to normal `[ ]` evernote elements
    for ul in soup.find_all('ul'):
        tasks = []
        istodo = True
        for li in ul.find_all('li'):
            li_text = li.get_text()
            reg = checktodo_re.match(li_text)
            istodo = istodo and reg

            task = soup.new_tag('div')
            todo_tag = soup.new_tag('en-todo')
            if reg and reg.group(1) == "x":
                todo_tag['checked'] = "true"
            task.append(todo_tag)
            if reg:
                task.append(NavigableString(li_text[3:].strip()))
            tasks.append(task)

        if istodo:
            # insert_after() puts each element directly behind the <ul>, so
            # insert back to front to keep the items in list order.
            for task in reversed(tasks):
                ul.insert_after(task)
            ul.extract()

    # For the rest of elements just replace `[ ]` with the appropriate element
    for todo in soup.find_all(text=checktodo_re):
        str_re = inline_re.match(todo)
        if str_re is None:
            continue
        pre = str_re.group(1)
        post = str_re.group(3)

        todo_tag = soup.new_tag('en-todo')
        if str_re.group(2) == "x":
            todo_tag['checked'] = "true"

        todo.replace_with(todo_tag)
        todo_tag.insert_before(pre)
        todo_tag.insert_after(post)
def check_all_pages(target=None):
    """Reads all pages for a target and checks them for style.

    Every page of the target that has an ``"md"`` key is rendered from
    markdown and parsed.  Each non-blank plain text node whose parent is one
    of the content elements listed below is passed to ``check_passage``
    together with the page's overrides.

    Returns:
        list of ``(page name, issues)`` tuples, one per page that had at
        least one issue.
    """
    target = dactyl_build.get_target(target)
    pages = dactyl_build.get_pages(target)
    # The returned environment is not used below; the call itself is kept.
    pp_env = dactyl_build.setup_pp_env()

    print("Style Checker - checking all pages in target %s" % target["name"])

    content_elements = ["p","li","a","em","strong","th","td",
                        "h1","h2","h3","h4","h5","h6"]
    style_issues = []
    for page in pages:
        if "md" not in page:
            # Not a doc page, move on
            continue
        logger.info("Checking page %s" % page["name"])

        html = dactyl_build.parse_markdown(page, pages=pages, target=target)
        soup = BeautifulSoup(html, "html.parser")
        overrides = get_overrides(soup)

        page_issues = []
        for el in soup.descendants:
            # Exact type match: subclasses of NavigableString are skipped.
            if type(el) != NavigableString:
                continue
            if el.parent.name not in content_elements:
                continue
            passage = str(el).strip()
            if not passage:
                continue
            passage_issues = check_passage(passage, overrides)
            if passage_issues:
                page_issues.extend(passage_issues)

        if page_issues:
            style_issues.append((page["name"], page_issues))
    return style_issues