The following 46 code examples, extracted from open-source Python projects, illustrate how to use bs4.Tag().
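Before the examples, here is a minimal, self-contained sketch of the pattern that recurs throughout them: checking isinstance(node, Tag) to skip NavigableString children (whitespace, comments) before reading a tag's name, attributes, and text. The markup is illustrative only, not taken from any of the projects below.

```
from bs4 import BeautifulSoup, Tag

# The newline between <li> items becomes a NavigableString child,
# which is why the examples below guard with isinstance(node, Tag).
soup = BeautifulSoup('<ul><li id="a">one</li>\n<li id="b">two</li></ul>', 'html.parser')
for child in soup.ul.children:
    if isinstance(child, Tag):  # skips the '\n' NavigableString
        print(child.name, child.get('id'), child.get_text())
```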
def parser_day_bangumi(soup):
    """
    :param soup:
    :type soup: bs4.Tag
    :return: list
    :rtype: list[dict]
    """
    li = []
    for soup in soup.find_all('li'):
        url = soup.select_one('a')
        span = soup.find('span')
        if url:
            name = url['title']
            url = url['href']
            assert isinstance(url, str)
            bangumi_id = url.split('/')[-1]
            li.append({'name': name, 'keyword': bangumi_id, 'cover': span['data-src']})
    return li
def process_post_bodies(bodies: List[Tag]) -> (str, list):
    for body in bodies:
        cites = list()
        cited = body.findAll('div', {'class': 'cite'})
        if cited:
            cites = [c['name'] for c in cited]
        collect_text = []
        for tag in body:
            # TODO: This is a suboptimal (and partially wrong) solution to parse cites in post body (a lot to improve here)
            if tag.name not in ('div', 'p'):
                if hasattr(tag, 'text'):
                    collect_text.append(tag.text)
                elif isinstance(tag, NavigableString):
                    collect_text.append(str(tag))
                else:
                    collect_text.append('\n')
            else:
                yield ''.join(collect_text), cites
def append_to(parent, tag, **kwargs):
    """
    Append an element to the supplied parent.

    :param parent: Parent to append to.
    :param tag: Tag to create.
    :param kwargs: Tag kwargs.
    :return: New element.
    """
    if hasattr(parent, "soup"):
        soup = parent.soup
    else:
        soup = parent.find_parent("html")
    # Create Tag explicitly instead of using new_tag, otherwise the attribute
    # "name" leads to a clash with the tag name in bs4
    new_tag = bs4.Tag(builder=soup.builder, name=tag, attrs=kwargs)
    new_tag.soup = soup
    parent.append(new_tag)
    return new_tag
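A hedged usage sketch for append_to(): since bs4 Tag objects carry no soup attribute by default, this sketch attaches one to the parent so the helper's first branch is taken. The markup and names are illustrative, not from the original project.

```
from bs4 import BeautifulSoup

soup = BeautifulSoup('<html><body></body></html>', 'html.parser')
body = soup.find('body')
body.soup = soup  # give the helper a .soup attribute carrying the builder
link = append_to(body, 'a', href='https://example.com')
link.string = 'example'
print(soup)
# <html><body><a href="https://example.com">example</a></body></html>
```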
def read_component(thing):
    if isinstance(thing, Tag):
        if thing.name == "em":
            return "*" + read_component(thing.next_element) + "*"
        elif thing.name == "strong":
            return "**" + read_component(thing.next_element) + "**"
        elif thing.name == "u":
            return "__" + read_component(thing.next_element) + "__"
        elif thing.attrs.get("style") == "text-decoration: line-through;":
            return "~~" + read_component(thing.next_element) + "~~"
        elif thing.attrs.get("id") is not None and "footnoteref" in thing.attrs["id"]:
            return ""
        else:
            return read_component(thing.next_element)
    else:
        return thing
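A quick, hedged demonstration of read_component() converting inline tags to Markdown-style markers; the HTML is made up for illustration.

```
from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>plain <strong>bold</strong> <em>italic</em></p>', 'html.parser')
print(read_component(soup.find('strong')))  # '**bold**'
print(read_component(soup.find('em')))      # '*italic*'
```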
def show_weather(cityinfo):
    print(u'Fetching weather for #%s,%s# ...' % (cityinfo.get(u'parent_name_ch'), cityinfo.get(u'city_name_ch')))
    weather_content = api.getWeather(cityinfo.get(u'id'))
    soup = BeautifulSoup(weather_content, u'html.parser')
    # print(soup.prettify())
    # print(soup.title)
    table_tag = soup.find_all(u'table', class_=u'sevendays')[0]
    for child in table_tag.children:
        if not isinstance(child, Tag):
            continue
        date = child.find(u'td', class_=u'date').get_text()
        temp = child.find(u'td', class_=u'temp').get_text()
        desc = child.find(u'td', class_=u'desc').get_text()
        print(''.join(date.split()))
        print(''.join(temp.split()))
        print(''.join(desc.split()))
        print(u'=================')
def _showWeather(self, city):
    self.info.insert(tk.INSERT, u'Fetching weather for #%s, %s# ...\n\n\n' % (
        city.get(u'city_name_ch'), city.get(u'parent_name_ch')))
    weather_content = self.api.getWeather(city.get(u'id'))
    soup = BeautifulSoup(weather_content, u'html.parser')
    table_tag = soup.find_all(u'table', class_=u'sevendays')[0]
    for child in table_tag.children:
        if not isinstance(child, Tag):
            continue
        date = child.find(u'td', class_=u'date').get_text()
        temp = child.find(u'td', class_=u'temp').get_text()
        desc = child.find(u'td', class_=u'desc').get_text()
        self.info.insert(tk.INSERT, ''.join(date.split()) + '\n')
        self.info.insert(tk.INSERT, ''.join(temp.split()) + '\n')
        self.info.insert(tk.INSERT, ''.join(desc.split()) + '\n')
        self.info.insert(tk.INSERT, u'=================' + '\n')
def get_movie_list(kw_movie, pageIndex=0):
    url = api_movies.format(movie=kw_movie, page_index=pageIndex)
    html = fetch_text(url)
    dom = BeautifulSoup(html, 'html.parser')
    try:
        # 1. movies
        div_items = dom.find_all('div', 'item prel clearfix')  # type: list
        movies = []
        for div in div_items:
            movie = process_movie_item(div)
            movies.append(movie)
        # 2. next page
        div_page = dom.find('div', 'pagination l clearfix')
        index, haveNext = process_page_next(div_page)
        page = PageMovie(movies, index, haveNext)
        return Resp(page)
    except Exception as e:
        return Resp(errorMsg=e.__repr__())
def process_movie_item(div_item: Tag) -> Movie:
    movie = Movie()
    # ----------------
    div1 = div_item.find('div', 'litpic hidden-xs hidden-sm')
    a = div1.findChild()
    # detail_url
    movie.detail_url = base_url + a['href']
    # avatar
    img = a.findChild()
    movie.avatar_url = img['data-original']
    # ---------------
    div2 = div_item.find('div', 'title')  # type: Tag
    b = div2.select("p a b")[0]  # type: Tag
    movie.name = b.text
    return movie
def get_MovieList(keyword: str) -> List[Movie]:
    '''
    Search movies by keyword.
    :param keyword:
    :return:
    '''
    r = requests.get(base_url + '/search?ad=1&q={0}'.format(keyword))
    dom = BeautifulSoup(r.text, 'html.parser')
    list_movie = []
    div_blocks = dom.find_all('div', class_='item prel clearfix')
    try:
        for div_block in div_blocks:  # type: Tag
            movie = get_Movie(div_block)
            if movie:
                list_movie.append(movie)
    except BaseException:
        pass
    return list_movie
def get_Movie(item: Tag) -> Movie:
    '''
    Parse a single movie item.
    :param item:
    :return:
    '''
    try:
        movie = Movie()
        a = item.select_one('div.title p a')  # type: Tag
        movie.detail_url = a['href']
        movie.name = a.findChild().text
    except BaseException:
        pass
    return movie
def get_ZimusByMovie(url: str) -> List[Zimu]:
    r = requests.get(base_url + "/" + url)
    dom = BeautifulSoup(r.text, 'html.parser')
    list_zimu = []
    father = dom.select_one('body tbody')  # type: Tag
    trs = father.select('tr')  # type: List[Tag]
    for tr in trs:
        try:
            a = tr.select_one('td a')
            zimu = Zimu()
            zimu.detail_url = a['href']
            zimu.name = a['title']
            list_zimu.append(zimu)
        except BaseException:
            continue
    return list_zimu
def naver_complete_login(request, app, token):
    provider = providers.registry.by_id(NaverProvider.id)
    headers = {'authorization': 'Bearer {}'.format(token.token)}
    resp = requests.get(API_URL + '/nid/getUserProfile.xml', headers=headers)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'xml')
    parsed = {}
    for sub in ('result', 'response'):
        props = {}
        for tag in soup.find(sub):
            if isinstance(tag, Tag):
                props[tag.name] = tag.text
        parsed[sub] = props
    extra_data = parsed['response']
    login = provider.sociallogin_from_response(request, extra_data)
    return login
def parse_translation_table(self, table):
    """
    Overrides GeneralParser's method.
    :param table: a Tag object. Not necessarily a table; can be a div.
    :return: (translation, language_name, language_code)
    """
    # go through all "li" elements in a table
    for li in table.find_all('li'):
        if not isinstance(li, Tag):
            continue
        text = li.get_text().split(':')
        if len(text) < 2:
            continue
        # language name is before ":"
        lang_name = text[0]
        # language code is usually in superscript
        lang_code = li.find(class_="trad-sup-code")
        if lang_code:
            lang_code = lang_code.text.strip()[1:-1]
        else:
            lang_code = ""
        # There are two functions that remove parentheses. Not sure which one to use.
        t = remove_parenthesis(text[1])
        trans_list = re.split(COMMA_OR_SEMICOLON, t)
        # each "trans" is: translation <sup>(lang_code)</sup> (transliteration)
        # lang_code and transliteration may not exist
        for trans in trans_list:
            translation = trans.split('(')[0].strip()
            yield (translation, lang_name.strip(), lang_code)
def parse_translation_table(self, table):
    """
    Parse the table to get translations and the languages.
    Hopefully this function will work for most editions. Override this method if needed.
    :param table: a Tag object. Not necessarily a table; can be a div.
    :return: (translation, language_name, language_code)
    """
    for li in table.find_all('li'):
        if not isinstance(li, Tag):
            continue
        text = li.get_text().split(':')
        # TBD: the table is not a translation table,
        # OR the table is a translation table but there are some <li> without a colon
        if len(text) < 2:
            continue
        # language name is before ":"
        lang_name = text[0].strip()
        # language code is in superscript
        lang_code = li.find("sup")
        if lang_code:
            lang_code = remove_all_punctuation(lang_code.text).strip()
        else:
            lang_code = ""
        t = remove_parenthesis(text[1])
        trans_list = re.split(COMMA_OR_SEMICOLON, t)
        # each "trans" is: translation <sup>(lang_code)</sup> (transliteration)
        # lang_code and transliteration may not exist
        for trans in trans_list:
            # translation = trans.split('(')[0].strip()
            translation = re.split(r'[(??]', trans)[0].strip()
            # Throw out tuples if they have '[['
            if "[[" in translation:
                continue
            yield (translation, lang_name, lang_code)
def parse_unordered_list_polish(self, ulist):
    for li in ulist.find_all('li'):
        if not isinstance(li, Tag):
            continue
        if not li.get_text() == '':
            text = li.get_text().split(':')
            lang_name = text[0]
            lang_code = ''
            if len(text) > 1:
                trans_list = re.split(COMMA_OR_SEMICOLON, text[1])
                for trans in trans_list:
                    translation = remove_parenthesis(trans).strip()
                    yield (translation, lang_name, lang_code)
def parse_translation_table_russian(self, table):
    for li in table.find_all('li'):
        if not isinstance(li, Tag):
            continue
        text = li.get_text().split(':')
        # language name is before ":"
        lang_name = text[0]
        lang_code = ''
        if li.find("sub"):
            lang_code = li.find("sub").get_text()
            # remove the lang code from the lang name
            lang_name = lang_name[:-len(lang_code)]
        if len(text) > 1:
            t = remove_parenthesis(text[1])
        else:
            t = remove_parenthesis(text[0])
        trans_list = re.split(COMMA_OR_SEMICOLON, t)
        for trans in trans_list:
            translation = trans.split('(')[0].strip()
            if not translation == '':
                yield (translation, lang_name, lang_code)
def bs_tag_to_string(bstag: Tag) -> str:
    return ''.join(str(item) for item in bstag.contents)
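To contrast bs_tag_to_string() with get_text(): the former keeps the inner markup of the tag's children, while get_text() strips it. A small hedged example with made-up markup:

```
from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>Hello <b>world</b>!</p>', 'html.parser')
print(bs_tag_to_string(soup.p))  # Hello <b>world</b>!  (child markup preserved)
print(soup.p.get_text())         # Hello world!         (markup stripped)
```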
def search_by_keyword(self, keyword, count=None):
    """
    return a list of dict with at least 4 keys: download, name, title, episode
    example:
    ```
    [
        {
            'name': "?????????",
            'download': 'magnet:?xt=urn:btih:what ever',
            'title': "[????] ????????? ?12? MP4 720p ?",
            'episode': 12
        },
    ]
    ```
    :param keyword: search keyword
    :type keyword: str
    :param count: how many pages to fetch from the website
    :type count: int
    :return: list of episode search results
    :rtype: list[dict]
    """
    result = []
    r = network.get(server_root + "Home/Search", params={'searchstr': keyword}).text
    s = BeautifulSoup(r, 'lxml')
    td_list = s.find_all('tr', attrs={'class': 'js-search-results-row'})  # type: list[bs4.Tag]
    for tr in td_list:
        title = tr.find('a', class_='magnet-link-wrap').text
        time_string = tr.find_all('td')[2].string
        result.append({
            'download': tr.find('a', class_='magnet-link').attrs.get('data-clipboard-text', ''),
            'name': keyword,
            'title': title,
            'episode': self.parse_episode(title),
            'time': int(time.mktime(time.strptime(time_string, "%Y/%m/%d %H:%M"))),
        })
    # print(result)
    return result
def author(self):
    """The author of this work."""
    # The author of the work is kept in the byline, in the form
    #
    #     <h3 class="byline heading">
    #       <a href="/users/[author_name]" rel="author">[author_name]</a>
    #     </h3>
    #
    byline_tag = self._soup.find('h3', attrs={'class': 'byline'})
    a_tag = [t for t in byline_tag.contents if isinstance(t, Tag)]
    assert len(a_tag) == 1
    return a_tag[0].contents[0].strip()
def isTagClass(obj):
    return isinstance(obj, Tag)
def getelementlistwithlabel(tagObj, label, options={}):
    if isinstance(tagObj, Tag):
        elementlist = []
        templist = tagObj.find_all(label, attrs=options)
        elementlist.extend(templist)
        return elementlist
    else:
        print 'Wrong argument type, not a Tag object. Got: ' + repr(tagObj)
        return None
def gettextlistwithlabel(tagObj):
    if isinstance(tagObj, Tag):
        strlist = tagObj.get_text()
        return strlist.encode('utf-8')
    else:
        print 'Wrong argument type, not a Tag object. Got: ' + repr(tagObj)
        return None
def _parse_sample_tag(self, tag):
    assert isinstance(tag, bs4.Tag)
    assert tag.name == 'pre'
    prv = utils.previous_sibling_tag(tag)
    pprv = tag.parent and utils.previous_sibling_tag(tag.parent)
    if prv.name == 'h6' and tag.parent.name == 'div' and tag.parent['class'] == ['paragraph'] and pprv.name == 'h5':
        log.debug('h6: %s', str(prv))
        log.debug('name.encode(): %s', prv.string.encode())
        s = tag.string or ''  # tag.string for the tag "<pre></pre>" returns None
        return utils.textfile(s.lstrip()), pprv.string + ' ' + prv.string
def previous_sibling_tag(tag):
    tag = tag.previous_sibling
    while tag and not isinstance(tag, bs4.Tag):
        tag = tag.previous_sibling
    return tag
def next_sibling_tag(tag):
    tag = tag.next_sibling
    while tag and not isinstance(tag, bs4.Tag):
        tag = tag.next_sibling
    return tag
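A hedged sketch showing the two sibling helpers above skipping non-Tag nodes (whitespace and comments are NavigableStrings, not Tags); the markup is illustrative.

```
import bs4
from bs4 import BeautifulSoup

html = '<h3>Sample Input</h3>\n<!-- a comment -->\n<pre>1 2 3</pre>\n<p>notes</p>'
soup = BeautifulSoup(html, 'html.parser')
pre = soup.find('pre')
print(previous_sibling_tag(pre).name)  # 'h3'  (newline and comment are skipped)
print(next_sibling_tag(pre).name)      # 'p'   (the trailing newline is skipped)
```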
def __init__(self, form, url):
    assert isinstance(form, bs4.Tag)
    assert form.name == 'form'
    self.form = form
    self.url = url
    self.payload = {}
    self.files = {}
    for input in self.form.find_all('input'):
        log.debug('input: %s', str(input))
        if input.attrs.get('type') in ['checkbox', 'radio']:
            continue
        if 'name' in input.attrs and 'value' in input.attrs:
            self.payload[input['name']] = input['value']
def _parse_sample_tag(self, tag):
    assert isinstance(tag, bs4.Tag)
    assert tag.name == 'h2'
    name = tag.contents[0]
    if ':' in name:
        name = name[:name.find(':')]
    if name in ['Sample input', 'Sample output']:
        nxt = tag.next_sibling
        while nxt and nxt.string.strip() == '':
            nxt = nxt.next_sibling
        if nxt.name == 'pre':
            s = utils.textfile(utils.dos2unix(nxt.string.lstrip()))
        else:
            s = ''
        return s, name
def soupify(self, body):
    # https://www.crummy.com/software/BeautifulSoup/
    # docs: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
    # bs4 codebase: http://bazaar.launchpad.net/~leonardr/beautifulsoup/bs4/files
    if isinstance(body, Tag):
        return body
    soup = BeautifulSoup(body, "html.parser")
    return soup
def parse_tasks(tasks, year):
    db = []
    for task in tasks:
        if not isinstance(task, bs4.Tag):
            continue
        task = task.td
        base_name = task.strong.text.strip()
        next_tag = task.strong.next_sibling.next_sibling.name
        if next_tag == 'span':
            name = base_name
            maximum = task.span.text.strip()
            results = parse_results(task.table, year)
            db.append({'category': 'common', 'name': name, 'max': int(maximum),
                       'students': results, 'year': year})
        else:
            for st in task.findAll('font'):
                if st.previous.name != 'div':
                    continue
                name = st.text.strip()
                category = base_name
                maximum = st.findNext('span').text.strip()
                results = parse_results(st.findNext('table'), year)
                db.append({'category': category, 'name': name, 'max': int(maximum),
                           'students': results, 'year': year})
    return db
def is_leaf_table(table_soup):
    if not isinstance(table_soup, Tag):
        return True
    if len(table_soup.find_all('table')) == 0:
        return True
    return False
def is_leaf_td(td_soup):
    if not isinstance(td_soup, Tag):
        return True
    if td_soup.table is None:
        return True
    return False
def is_single_head(tr_soup):
    if not isinstance(tr_soup, Tag):
        return True
    if tr_soup.th.nextSibling.name != 'th':
        return True
    return False
def extract_txt(tag):
    return [el.get_text().strip() if is_leaf_td(el) else get_deep_table(el.table)
            for el in tag if isinstance(el, Tag)]
def fetch_level(element, limit=1024):
    length = 0
    parts = []
    if element is None:
        return "[DATA ERROR]"
    for thing in [element] + list(element.next_siblings):
        # component = read_component(thing)
        if isinstance(thing, Tag):
            if thing.name == "em":
                component = "*" + fetch_level(thing.next_element) + "*"
            elif thing.name == "strong":
                component = "**" + fetch_level(thing.next_element) + "**"
            elif thing.name == "u":
                component = "__" + fetch_level(thing.next_element) + "__"
            elif thing.attrs.get("style") == "text-decoration: line-through;":
                component = "~~" + fetch_level(thing.next_element) + "~~"
            elif thing.attrs.get("id") is not None and "footnoteref" in thing.attrs["id"]:
                return ""
            else:
                component = fetch_level(thing.next_element)
        else:
            component = thing
        if component:
            length += len(component)
            if length > limit - 3:
                if not component.endswith(".") or length > limit:
                    break
            else:
                parts.append(component)
    if len(parts) == 0:
        return "[WITHHELD]"
    return "".join(parts).strip("-:, ")
def first_starts_in(div):
    if len(div.contents) == 0:
        return False
    f = div.contents[0]
    txt = None
    if isinstance(f, bs4.Tag):
        if f.name == "h3":
            return False
        txt = f.get_text()
    else:
        txt = f.string
    return not starts_in(txt, ini1)
def sclean(txt):
    if isinstance(txt, bs4.Tag):
        txt = txt.get_text()
    txt = filter(lambda x: x in printable, txt)
    txt = sp.sub("", txt).strip()
    return txt
def get_spbr(soup):
    pcmd = []
    for p in soup.findAll("p"):
        if len(p.contents) > 2:
            c1 = p.contents[0]
            c2 = p.contents[1]
            c3 = p.contents[2]
            if isinstance(c1, bs4.Tag) and isinstance(c2, bs4.Tag):
                if c1.name == "span" and c2.name == "br":
                    if not isinstance(c3, bs4.Tag) or c3.name != "span" or c1.attrs["class"] != c3.attrs["class"]:
                        pcmd.append(p)
    return pcmd
def generate_translation_tuples(self, soup):
    """
    A generator of translation tuples
    :param soup: BeautifulSoup object
    :return: tuple of the form (edition, headword, head_lang, translation,
             trans_lang, trans_lang_code, part_of_speech)
    """
    # START non-edition-specific
    # this is the table of contents, which is present in each edition
    toc = soup.find('div', id='mw-content-text')
    page_state = {'headword': None, 'headword_lang': None, 'part_of_speech': ''}
    pronounce = ''
    headword_element = soup.find('h1', id='titleHeading')
    if headword_element is not None:
        page_state['headword'] = headword_element.text
    for element in toc.children:
        if isinstance(element, Tag):  # it could be a Tag or a NavigableString
            level = self.get_heading_level(element.name)
            # END non-edition-specific
            # Find the headword language
            if level == 2 and 'id' in element.attrs and element['id'] == 'mwAQ':
                page_state['headword_lang'] = element.text.replace('dili', '').strip()
                pronounce = ''
            elif level == 3 and 'id' in element.attrs and element['id'] == 'mwAw':
                page_state['part_of_speech'] = element.text
            elif element.name == 'ul':
                for li in element.find_all('li'):
                    if not isinstance(li, Tag):
                        continue
                    if li.get_text().split(':')[0] == 'Tələffüz':
                        pronounce = li.get_text().split(':')[1].strip()
            elif element.name == 'p':
                if '<div class="NavHead" #FFFFE0">' in element.text:
                    for translation, lang, lang_code in self.parse_translation_table(
                            element.find_next_sibling('div', class_='NavFrame')):
                        if translation == '':
                            continue
                        translation = translation.strip()
                        lang = lang.strip()
                        yield (self.edition, page_state['headword'], page_state['headword_lang'],
                               translation, lang, lang_code, page_state['part_of_speech'], pronounce)
def generate_translation_tuples(self, soup):
    """
    A generator of translation tuples
    :param soup: BeautifulSoup object
    :return: tuple of the form (edition, headword, head_lang, translation,
             trans_lang, trans_lang_code, part_of_speech, pronunciation)
    """
    # this is the table of contents
    # it is present in every page in the Dutch edition
    toc = soup.find('div', id='mw-content-text')
    # set default values for tuple elements
    page_state = {'headword': '', 'headword_lang': '', 'part_of_speech': '', 'pronunciation': ''}
    edition = "nl"
    if not toc:
        return  # skip it if there's no table of contents
    for element in toc.children:
        if isinstance(element, Tag):  # it could be a Tag or a NavigableString
            level = self.get_heading_level(element.name)
            if level == 2:  # it is a header tag; headword language almost always appears here
                page_state['headword_lang'] = self.get_heading_text(element)
            elif level == 4:  # it is an h4; part of speech almost always appears here
                page_state['part_of_speech'] = self.get_heading_text(element)
            elif element.name == "p":  # is a paragraph tag
                bold_word = element.b
                if bold_word:
                    page_state['headword'] = bold_word.get_text()  # the headword is usually just bolded
            elif element.name == "h5":
                first_headline = element.find(class_="mw-headline")
                if first_headline and first_headline.text.strip() == "Vertalingen":
                    # this is a translation header
                    while True:  # loop through all consecutive tables; they all have translations
                        table = element.find_next_sibling()
                        if table.has_attr("class") and "NavFrame" in table.get("class"):
                            for translation, lang, lang_code in self.parse_translation_table(table):
                                yield (edition, page_state['headword'], page_state['headword_lang'],
                                       translation, lang, lang_code,
                                       page_state['part_of_speech'], page_state['pronunciation'])
                            element = table  # move to the next table
                        else:
                            break
def generate_translation_tuples(self, soup):
    """
    A generator of translation tuples
    :param soup: BeautifulSoup object
    :return: tuple of the form (edition, headword, head_lang, translation,
             trans_lang, trans_lang_code, part_of_speech)
    """
    # START non-edition-specific
    # this is the table of contents, which is present in each edition
    toc = soup.find('div', id='mw-content-text')
    page_state = {'headword': None, 'headword_lang': None, 'part_of_speech': ''}
    pronounce = ''
    head = soup.find('h1', id='titleHeading')
    if head is not None:
        page_state['headword'] = head.text
    for element in toc.children:
        if isinstance(element, Tag):  # it could be a Tag or a NavigableString
            level = self.get_heading_level(element.name)
            # END non-edition-specific
            # Find the headword language
            if level == 1:
                if element.big is not None:
                    page_state['headword_lang'] = remove_parenthesis(element.b.text).strip()
            # Find the part of speech. Not sure if this works: the only consistent pattern
            # across the sampled pages is an h2 with the part of speech in a font tag, and
            # the sample is too small to be confident this holds in general.
            if level == 2:
                if element.text is not None:
                    page_state['part_of_speech'] = element.text.strip()
            # Find the translation table
            elif element.name == 'ul':
                for translation, lang, lang_code in self.parse_translation_table(element):
                    yield (self.edition, page_state['headword'], page_state['headword_lang'],
                           translation, lang, lang_code, page_state['part_of_speech'], pronounce)
                translation_table = False
def parse_page(self, soup):
    """
    Yield for each language section
    """
    # try:
    #     page_heading = soup.find('div', class_='mw-body-content').previous_sibling.text
    # except AttributeError as e:
    #     print(soup)
    page_content = soup.find('div', id='mw-content-text')
    page_heading = None
    element = soup.find('div', class_='mw-body-content') or page_content
    while not page_heading:
        if element is None:
            return None
        element = element.previous_sibling
        if isinstance(element, Tag):
            page_heading = element.text
    page_state = {'headword': None,
                  'headword_lang': '',
                  'part_of_speech': [''],
                  'pronunciation': '',
                  'translations': defaultdict(list)}
    for element in page_content.children:
        if isinstance(element, Tag):
            pronunciation = element.find(class_="IPA")
            if pronunciation:
                page_state['pronunciation'] = pronunciation.text.strip()
            level = self.get_heading_level(element.name)
            if level == 2:
                if page_state['headword']:
                    yield page_state
                page_state['headword'] = page_heading  # default value
                page_state['headword_lang'] = self.get_heading_text(element)
                page_state['part_of_speech'] = ['']
                page_state['pronunciation'] = ''
                page_state['translations'] = defaultdict(list)
            elif level == 3:
                page_state['part_of_speech'].append(self.get_heading_text(element))
            # elif element.name == "p":
            #     bold_word = element.b
            #     if bold_word:
            #         page_state['headword'] = bold_word.get_text()
            elif element.name == 'table' and 'class' in element.attrs and 'translations' in element['class']:
                translation_tup_list = list(self.parse_translation_table(element))
                if not translation_tup_list:
                    continue
                pos = page_state['part_of_speech'][-1]
                page_state['translations'][pos] += translation_tup_list
    if page_state['headword']:
        yield page_state
def parse_page(self, soup):
    page_content = soup.find('div', id='mw-content-text')
    page_heading = None
    element = soup.find('div', class_='mw-body-content') or page_content
    while not page_heading:
        if element is None:
            return None
        element = element.previous_sibling
        if isinstance(element, Tag):
            page_heading = element.text
    page_state = {'headword': None,
                  'headword_lang': '',
                  'part_of_speech': [''],
                  'pronunciation': '',
                  'translations': defaultdict(list)}
    for element in page_content.children:
        if isinstance(element, Tag):  # it could be a Tag or a NavigableString
            pronunciation = element.find(class_='IPA')
            if pronunciation:
                page_state['pronunciation'] = pronunciation.text.strip()
            level = self.get_heading_level(element.name)
            if level == 2:
                if page_state['headword']:
                    yield page_state
                page_state['headword'] = page_heading  # default value
                page_state['headword_lang'] = self.get_heading_text(element)
                page_state['part_of_speech'] = ['']
                page_state['pronunciation'] = ''
                page_state['translations'] = defaultdict(list)
            elif level == 3:
                page_state['part_of_speech'].append(self.get_heading_text(element))
            # elif element.name == "p":  # is a paragraph tag
            #     bold_word = element.b
            #     if bold_word:
            #         page_state['headword'] = bold_word.get_text()
            #         print("headword: ", bold_word.get_text().strip())
            elif element.name == "h4":
                first_headline = element.find()
                if first_headline and first_headline.text.strip() == u"Dịch":  # this is a translation header
                    table = element.find_next_sibling(class_="columns")
                    translation_tup_list = list(self.parse_translation_table(table))
                    if not translation_tup_list:
                        continue
                    pos = page_state['part_of_speech'][-1]
                    page_state['translations'][pos] += translation_tup_list
    if page_state['headword']:
        yield page_state
def generate_translation_tuples(self, soup):
    """
    A generator of translation tuples
    :param soup: BeautifulSoup object
    :return: tuple of the form (edition, headword, head_lang, translation,
             trans_lang, trans_lang_code, part_of_speech, pronunciation)
    """
    # this is the table of contents
    # it is present in every page in the French edition
    toc = soup.find('div', id='mw-content-text')
    # set default values for tuple elements
    page_state = {'headword': '', 'headword_lang': '', 'part_of_speech': '', 'pronunciation': ''}
    edition = "fr"
    if not toc:
        return  # skip it if there's no table of contents
    for element in toc.children:
        if isinstance(element, Tag):  # it could be a Tag or a NavigableString
            level = self.get_heading_level(element.name)
            if level == 2:  # it is a header tag; headword language almost always appears here
                page_state['headword_lang'] = self.get_heading_text(element).strip()
            elif level == 3:  # it is an h3; part of speech almost always appears here
                page_state['part_of_speech'] = self.get_heading_text(element).strip()
            elif element.name == "p":  # is a paragraph tag
                bold_word = element.b
                if bold_word:
                    page_state['headword'] = bold_word.get_text()  # the headword is usually just bolded
                link = element.span  # pronunciation usually appears right after headword in an <a> tag
                if link:
                    if link.has_attr('class') and "API" in link.get("class"):
                        page_state['pronunciation'] = link.get_text()
            elif element.name == "h4":
                first_headline = element.find("span")
                if first_headline and first_headline.text.strip() == "Traductions":
                    # this is a translation header
                    while True:  # loop through all consecutive tables; they all have translations
                        table = element.find_next_sibling()
                        if table.has_attr("class") and "boite" in table.get("class"):
                            for translation, lang, lang_code in self.parse_translation_table(table):
                                yield (edition, page_state['headword'], page_state['headword_lang'],
                                       translation, lang, lang_code,
                                       page_state['part_of_speech'], page_state['pronunciation'])
                            element = table  # move to the next table
                        else:
                            break
def parse_page(self, soup):
    page_content = soup.find('div', id='mw-content-text')
    page_heading = None
    element = soup.find('div', class_='mw-body-content') or page_content
    while not page_heading:
        if element is None:
            return None
        element = element.previous_sibling
        if isinstance(element, Tag):
            page_heading = element.text
    page_state = {'headword': None,
                  'headword_lang': '',
                  'part_of_speech': [''],
                  'pronunciation': '',
                  'translations': defaultdict(list)}
    for element in page_content.children:
        if isinstance(element, Tag):
            pronunciation = element.find(class_="ipa")
            if pronunciation:
                page_state['pronunciation'] = pronunciation.text.strip()
            level = self.get_heading_level(element.name)
            if level == 2:
                if page_state['headword']:
                    yield page_state
                s = self.get_heading_text(element)  # format: "headword (language)"
                page_state['headword'] = s.split('(')[0].strip() or page_heading
                page_state['headword_lang'] = s[s.find("(") + 1:s.find(")")]
                page_state['translation_region'] = False
                page_state['part_of_speech'] = ['']
                page_state['pronunciation'] = ''
                page_state['translations'] = defaultdict(list)
            elif level == 3:
                page_state['part_of_speech'].append(self.get_heading_text(element).split(',')[0].strip())
                page_state['translation_region'] = False
            elif element.name == "h4":
                if element.text.strip() == u"Übersetzungen":
                    page_state['translation_region'] = True
                    continue
                first_headline = element.find()
                if first_headline and first_headline.text.strip() == u"Übersetzungen":
                    page_state['translation_region'] = True
                else:
                    page_state['translation_region'] = False
            elif 'class' not in element.attrs:
                page_state['translation_region'] = False
            elif page_state['translation_region']:
                translation_tup_list = list(self.parse_translation_table(element))
                if not translation_tup_list:
                    continue
                pos = page_state['part_of_speech'][-1]
                page_state['translations'][pos] += translation_tup_list
    if page_state['headword']:
        yield page_state
def fetch_episode_of_bangumi(self, bangumi_id, subtitle_list=None, max_page=MAX_PAGE):
    """
    get all episodes by bangumi id
    example:
    ```
    [
        {
            "download": "magnet:?xt=urn:btih:e43b3b6b53dd9fd6af1199e112d3c7ff15cab82c",
            "name": "????",
            "subtitle_group": "58a9c1c9f5dc363606ab42ec",
            "title": "?????????????[????/Made in Abyss][07][GB][720P]",
            "episode": 0,
            "time": 1503301292
        },
    ]
    ```
    :param bangumi_id: bangumi_id
    :param subtitle_list: list of subtitle groups
    :type subtitle_list: list
    :param max_page: how many pages to crawl if there is no subtitle list
    :type max_page: int
    :return: list of bangumi
    :rtype: list[dict]
    """
    result = []
    if os.environ.get('DEBUG', False):
        print(server_root + 'Bangumi/{}'.format(bangumi_id))
    r = network.get(server_root + 'Home/Bangumi/{}'.format(bangumi_id)).text
    soup = BeautifulSoup(r, 'lxml')
    # name = soup.find('p', class_='bangumi-title').text
    container = soup.find('div', class_='central-container')  # type: bs4.Tag
    episode_container_list = {}
    for index, tag in enumerate(container.contents):
        if hasattr(tag, 'attrs'):
            subtitle_id = tag.attrs.get('id', False)
            if subtitle_list:
                if subtitle_id in subtitle_list:
                    episode_container_list[tag.attrs.get('id', None)] = tag.find_next_sibling('table')
            else:
                if subtitle_id:
                    episode_container_list[tag.attrs.get('id', None)] = tag.find_next_sibling('table')
    for subtitle_id, container in episode_container_list.items():
        for tr in container.find_all('tr')[1:]:
            title = tr.find('a', class_='magnet-link-wrap').text
            time_string = tr.find_all('td')[2].string
            result.append({
                'download': tr.find('a', class_='magnet-link').attrs.get('data-clipboard-text', ''),
                'subtitle_group': str(subtitle_id),
                'title': title,
                'episode': self.parse_episode(title),
                'time': int(time.mktime(time.strptime(time_string, "%Y/%m/%d %H:%M"))),
            })
    return result
def scrape_tag_contents(tags, html):
    tag_list = copy.copy(tags)
    if isinstance(html, Tag):
        soup = html
    else:
        soup = BeautifulSoup(html, "lxml")
    results = []
    content_tag, content_attr = tag_list.pop()
    if not len(tag_list):
        return list(soup.findAll(name=content_tag, attrs=content_attr))
    first_tag, first_attr = tag_list.pop(0)
    element_list = soup.findAll(name=first_tag, attrs=first_attr)
    for tag, attr in tag_list:
        temp = ResultSet([], ())
        for element in element_list:
            if isinstance(attr, dict):
                temp += element.findAll(name=tag, attrs=attr)
            elif isinstance(attr, unicode) or isinstance(attr, str):
                if element.has_attr(attr):
                    temp.append(element[attr])
        element_list = temp
    for element in element_list:
        if content_tag == "regex":
            pattern = content_attr
            text = element
            if not isinstance(text, str):
                text = element.text
            if text:
                match = re.findall(pattern, text)
                if match:
                    results.append(match[0])
        elif content_attr is None or content_attr == "":
            if content_tag is None or content_tag == "":
                text = element
            else:
                text = element.find(content_tag)
            if text:
                results.append(text.text)
        elif content_tag is None or content_tag == "":
            if element.has_attr(content_attr):
                results.append(element[content_attr])
        else:
            info_container = element.findAll(name=content_tag)
            for container in info_container:
                if isinstance(content_attr, dict):
                    results.append(container)
                elif container.has_attr(content_attr):
                    results.append(container[content_attr])
    return results