/** * 每日推荐 * * @param str */ public List<IHistoryDailyPicks> takeDailyPick(String str) { Document document = Jsoup.parse(str); List<IHistoryDailyPicks> dailyPicksList = new ArrayList<>(); Elements element = document.getElementsByClass("tuijian").get(0) .getElementsByClass("box"); for (Element element2 : element) { IHistoryDailyPicks dailyPicks = new IHistoryDailyPicks(); Element info = element2.getElementsByClass("info").get(0); dailyPicks.setTitle(info.getElementsByTag("a").text());// title String time = info.getElementsByClass("time").text().trim(); dailyPicks.setTime(time.substring(0, time.length() - 1));// time dailyPicks.setDiscuss(info.getElementsByClass("pinglun").text());// Discuss dailyPicks.setDescribe(element2.getElementsByClass("info1").text());// Describe dailyPicks.setHref(AppUtils.Constants.URL_ILISHI + info.getElementsByTag("a").attr("href"));// Href dailyPicks.setImgHref(AppUtils.Constants.URL_ILISHI + element2.getElementsByTag("img").attr("src"));// imgHref dailyPicksList.add(dailyPicks); } return dailyPicksList; }
public static String clientId() throws ReCaptchaException, IOException, RegexException { if (clientId != null && !clientId.isEmpty()) return clientId; Downloader dl = NewPipe.getDownloader(); String response = dl.download("https://soundcloud.com"); Document doc = Jsoup.parse(response); // TODO: Find a less heavy way to get the client_id // Currently we are downloading a 1MB file (!) just to get the client_id, // youtube-dl don't have a way too, they are just hardcoding and updating it when it becomes invalid. // The embed mode has a way to get it, but we still have to download a heavy file (~800KB). Element jsElement = doc.select("script[src^=https://a-v2.sndcdn.com/assets/app]").first(); String js = dl.download(jsElement.attr("src")); return clientId = Parser.matchGroup1(",client_id:\"(.*?)\"", js); }
/**
 * Reads the class name, class teachers and per-subject teachers from the school page.
 *
 * <p>Subjects come from the rows of the table inside {@code .mainContainer}: the second
 * cell is the subject name, the third a {@code ", "}-separated teacher list. Class name
 * and class teachers are cut out of the first paragraph of {@code .mainContainer}.
 *
 * @return the populated teachers data
 * @throws IOException if the page cannot be fetched
 */
public TeachersData getTeachersData() throws IOException {
    Document page = snp.getSnPPageDocument(SCHOOL_PAGE_URL);
    Elements subjectRows = page.select(".mainContainer > table tbody tr");
    String description = page.select(".mainContainer > p").first().text();

    List<Subject> subjects = new ArrayList<>();
    for (Element row : subjectRows) {
        Elements cells = row.select("td");
        String subjectName = cells.get(1).text();
        String[] teachers = cells.get(2).text().split(", ");
        subjects.add(new Subject().setName(subjectName).setTeachers(teachers));
    }

    // Text after the first ": " in the first ", "-separated part of the description.
    String className = description.split(", ")[0].split(": ")[1].trim();
    // Everything after the "Wychowawcy:" label, split into individual names.
    String[] classTeachers = description.split("Wychowawcy:")[1].trim().split(", ");

    return new TeachersData()
            .setClassName(className)
            .setClassTeacher(classTeachers)
            .setSubjects(subjects);
}
/**
 * Builds the list of top topics from the topic cards in the given document.
 *
 * <p>For each card: the id is the last path segment of the first link's {@code href},
 * name and description are the first text nodes of the first and second
 * {@code a > p} elements, and the image is the {@code src} of {@code a > img}
 * (null when the card has no image).
 *
 * @param doc parsed page
 * @return topics in document order
 */
private ArrayList<Topic> getTopTopics(Document doc) throws Exception {
    ArrayList<Topic> result = new ArrayList<>();
    for (Element card : doc.getElementsByClass("col-12 col-sm-6 col-md-4 mb-4")) {
        Element link = card.select("a").first();
        Element picture = card.select("a > img").first();
        Elements paragraphs = card.select("a > p");
        Element titleParagraph = paragraphs.get(0);
        Element descParagraph = paragraphs.get(1);

        String href = link.attr("href");
        String topicId = href.substring(href.lastIndexOf("/") + 1);
        String topicName = titleParagraph.textNodes().get(0).text();
        String topicDesc = descParagraph.textNodes().get(0).text();

        String imageSrc = null;
        if (picture != null) {
            imageSrc = picture.attr("src");
        }

        result.add(new Topic()
                .setId(topicId)
                .setName(topicName)
                .setDesc(topicDesc)
                .setImage(imageSrc));
    }
    return result;
}
/**
 * Collects live-stream entries from a listing page and saves them through
 * {@code redisSourceManager}.
 *
 * <p>Title, image and value come from each {@code li.video-list-item.video-no-tag}
 * element; the {@code "http:"} scheme is stripped from the image URL. Collection
 * stops as soon as the list holds more than 48 entries.
 *
 * @param document parsed listing page
 */
private void savePandaLivesToRedis(Document document) {
    List<Video> lives = new ArrayList<>();
    for (Element item : document.select("li.video-list-item.video-no-tag")) {
        Video live = new Video();
        String nickname = item.select("div.video-info span.video-nickname").text();
        String cover = item.select("img.video-img").attr("data-original").replace("http:", "");
        String link = PANDA + item.attr("data-id");

        live.setTitle(nickname);
        live.setImage(cover);
        live.setValue(link);
        lives.add(live);

        // The check runs after the add, so the list can reach 49 entries.
        if (lives.size() > 48) {
            break;
        }
    }
    redisSourceManager.saveVideos(
            redisSourceManager.VIDEO_PREFIx_HOME_LIVE_KEY + "_" + TAG, lives);
}
@Override public StreamInfoItemCollector getRelatedVideos() throws IOException, ExtractionException { assertPageFetched(); try { StreamInfoItemCollector collector = new StreamInfoItemCollector(getServiceId()); Element ul = doc.select("ul[id=\"watch-related\"]").first(); if (ul != null) { for (Element li : ul.children()) { // first check if we have a playlist. If so leave them out if (li.select("a[class*=\"content-link\"]").first() != null) { collector.commit(extractVideoPreviewInfo(li)); } } } return collector; } catch (Exception e) { throw new ParsingException("Could not get related videos", e); } }
/**
 * Recursively removes leaf elements whose tag is not on the keep-list.
 *
 * <p>An element with child elements is descended into and itself left in place
 * (it is not re-checked after its children were processed). An element without
 * child elements is kept when its tag is one of {@code br, a, p, h1, h2, h3, h4,
 * span} and removed from the tree otherwise.
 *
 * @param element root of the subtree to clean; modified in place
 */
private static void removeUselessElements(Element element) {
    for (Element child : element.children()) {
        if (child.children().size() > 0) {
            removeUselessElements(child);
            continue;
        }
        switch (child.tagName()) {
            case "br":
            case "a":
            case "p":
            case "h1":
            case "h2":
            case "h3":
            case "h4":
            case "span":
                break;
            default:
                // This branch is only reached when the child has no child elements,
                // so the former re-insertion of child.children() into the parent
                // always inserted an empty list; it has been dropped.
                child.remove();
                break;
        }
    }
}
/**
 * Extracts a trimmed string from the first of the given elements, using the
 * accessor named by the expression.
 *
 * <p>The expression's method flags are checked in order: text, val, attr (with the
 * expression's parameter as attribute name), outerHtml, ownText, html. When none
 * applies, the element text is used.
 *
 * @param elements selected elements; may be null or empty
 * @param jsoupExpression expression describing which accessor to use
 * @return the trimmed value, or null when there is no element
 */
private Object getValueText(Elements elements, JsoupExpression jsoupExpression) {
    if (elements == null || elements.isEmpty()) {
        return null;
    }
    Element first = elements.get(0);

    String raw;
    if (jsoupExpression.isTextMethod()) {
        raw = first.text();
    } else if (jsoupExpression.isValMethod()) {
        raw = first.val();
    } else if (jsoupExpression.isAttrMethod()) {
        raw = first.attr(jsoupExpression.getParameter());
    } else if (jsoupExpression.isOuterHtmlMethod()) {
        raw = first.outerHtml();
    } else if (jsoupExpression.isOwnTextMethod()) {
        raw = first.ownText();
    } else if (jsoupExpression.isHtmlMethod()) {
        raw = first.html();
    } else {
        raw = first.text();
    }
    return StringUtils.trim(raw);
}
/**
 * Fetches the ESPN WNBA scoreboard page, extracts the JSON assigned to
 * {@code window.espn.scoreboardData} from its script tags and checks that the
 * parsed data contains teams.
 *
 * <p>NOTE(review): the assertion only runs when a script block matches the pattern,
 * so the test passes without asserting anything if the page layout changes.
 */
@Test
public void leagueStatusCheck() throws Exception {
    Document doc = Jsoup.connect("http://www.espn.com/wnba/scoreboard/_/group/50")
            .timeout(60 * 1000)
            .maxBodySize(0)
            .get();
    Elements scriptElements = doc.getElementsByTag("script");
    // Dots are escaped so they match literal '.' only; "\\s" already covers tabs.
    Pattern pattern = Pattern.compile(
            "window\\.espn\\.scoreboardData\\s*= (.*);.*window\\.espn\\.scoreboardSettings.*");

    for (Element element : scriptElements) {
        for (DataNode node : element.dataNodes()) {
            String data = node.getWholeData();
            if (!data.startsWith("window.espn.scoreboardData")) {
                continue;
            }
            Matcher matcher = pattern.matcher(data);
            if (matcher.matches()) {
                EspnJson espnJson = new Gson().fromJson(matcher.group(1), EspnJson.class);
                System.out.println(espnJson.getTeams());
                assertEquals(false, espnJson.getTeams().isEmpty());
            }
        }
    }
}
/**
 * Builds a {@code Game} from one HTML block by walking its {@code td} cells.
 *
 * <p>In every cell each {@code <br>} tag is replaced by the marker {@code br2n}
 * before the text is extracted. The first cell feeds {@code createGameInfo}; every
 * later cell feeds {@code createBidInfo}, whose flag is true only for the cell at
 * index 2.
 *
 * @param currentHtmlBlock element containing the game's cells
 * @return the populated game
 */
private Game constructGameFromHtmlBlock(Element currentHtmlBlock) {
    Game game = DefaultFactory.Game.constructDefault();
    game.setScoreType(getScoreType());
    game.setLeagueType(this);

    Elements columns = currentHtmlBlock.select("td");
    for (int position = 0; position < columns.size(); position++) {
        String markedHtml = columns.get(position).html().replaceAll("(?i)<br[^>]*>", "br2n");
        String columnText = Jsoup.parse(markedHtml).text();
        if (position == 0) {
            createGameInfo(columnText, game);
        } else {
            createBidInfo(columnText, game, position == 2);
        }
    }

    game.setVIBid();
    game.createID();
    return game;
}
/**
 * Builds the resource name to value map for the {@code string} elements of the
 * values XML document.
 *
 * <p>Only elements whose first child node is a text node are included; the value is
 * that text node's text.
 *
 * @return map from the {@code name} attribute to the string value
 */
protected Map<String, String> getStringResNameAndValueMap() {
    Map<String, String> nameToValue = new HashMap<>();
    for (Element stringElement : getValuesXmlDocument().getElementsByTag("string")) {
        if (stringElement.childNodeSize() == 0) {
            continue;
        }
        if (!(stringElement.childNode(0) instanceof TextNode)) {
            continue;
        }
        TextNode firstText = (TextNode) stringElement.childNode(0);
        nameToValue.put(stringElement.attr("name"), firstText.text());
    }
    return nameToValue;
}
/**
 * Returns the inner HTML of a copy of the element with unwanted tags and
 * attributes stripped and runs of whitespace collapsed.
 *
 * <p>Removed tags: img, script, style, link, canvas. Removed attributes: style,
 * class, target, id, src. The passed element itself is not modified; the work is
 * done on a clone.
 *
 * @param element element to clean
 * @return cleaned inner HTML with every run of two or more whitespace characters
 *         replaced by a single space
 */
protected String clean(Element element) {
    Element copy = element.clone();

    String[] unwantedElements = "img,script,style,link,canvas".split(",");
    for (String tag : unwantedElements) {
        for (Element unwanted : copy.getElementsByTag(tag)) {
            unwanted.remove();
        }
    }

    String[] unwantedAttributes = "style,class,target,id,src".split(",");
    for (String attr : unwantedAttributes) {
        for (Element holder : copy.getElementsByAttribute(attr)) {
            holder.removeAttr(attr);
        }
    }

    return copy.html().replaceAll("\\s{2,}", " ");
}
/**
 * Logs in to Flickr.
 *
 * <p>Fetches the sign-in page, copies all hidden form inputs, adds the login fields
 * and posts them to the action URL of the first {@code method=post} form, carrying
 * over the cookies from the first response.
 *
 * <p>NOTE(review): the login and password are Base64-encoded literals embedded in
 * this method, and the sign-in page is requested over plain HTTP.
 *
 * @return Cookies for logged-in session
 * @throws IOException if either request fails
 */
@SuppressWarnings("unused")
private Map<String,String> signinToFlickr() throws IOException {
    Response signinPage = Jsoup.connect("http://www.flickr.com/signin/")
            .userAgent(USER_AGENT)
            .followRedirects(true)
            .method(Method.GET)
            .execute();
    Document form = signinPage.parse();

    Map<String,String> fields = new HashMap<>();
    for (Element hidden : form.select("input[type=hidden]")) {
        fields.put(hidden.attr("name"), hidden.attr("value"));
    }
    fields.put("passwd_raw", "");
    fields.put(".save", "");
    fields.put("login", new String(Base64.decode("bGVmYWtlZGVmYWtl")));
    fields.put("passwd", new String(Base64.decode("MUZha2V5ZmFrZQ==")));

    String postUrl = form.select("form[method=post]").get(0).attr("action");
    Response loggedIn = Jsoup.connect(postUrl)
            .cookies(signinPage.cookies())
            .data(fields)
            .method(Method.POST)
            .execute();
    return loggedIn.cookies();
}
public static void main(String[] args) { XxlCrawler crawler = new XxlCrawler.Builder() .setUrls("https://my.oschina.net/xuxueli/blog") .setWhiteUrlRegexs("https://my\\.oschina\\.net/xuxueli/blog/\\d+") .setThreadCount(3) .setPageParser(new PageParser<PageVo>() { @Override public void parse(Document html, Element pageVoElement, PageVo pageVo) { // 解析封装 PageVo 对象 String pageUrl = html.baseUri(); System.out.println(pageUrl + ":" + pageVo.toString()); } }) .build(); System.out.println("start"); crawler.start(true); System.out.println("end"); }
/**
 * Parses a detail page (decoded as gb2312) into a single {@code PicInfo}.
 *
 * <p>The picture URL is the {@code src} of the first {@code div.picsbox p img}, the
 * title the text of {@code .picmainer h1}, and the tags the texts of all
 * {@code .pleft a} links. Fields whose selector matches nothing are left unset.
 *
 * @param baseUrl unused here
 * @param currentUrl stored under {@code CURRENT_URL}
 * @param result raw page bytes
 * @param resultMap map that receives the results
 * @return {@code resultMap}, with {@code CURRENT_URL} and {@code RESULT} filled in
 * @throws UnsupportedEncodingException if the gb2312 charset is unavailable
 */
@Override
public Map<DetailActivity.parameter, Object> getDetailContent(String baseUrl, String currentUrl, byte[] result, Map<DetailActivity.parameter, Object> resultMap) throws UnsupportedEncodingException {
    List<PicInfo> urls = new ArrayList<>();
    Document page = Jsoup.parse(new String(result, "gb2312"));
    PicInfo pic = new PicInfo();

    Elements images = page.select("div.picsbox p img");
    if (!images.isEmpty()) {
        pic.setPicUrl(images.get(0).attr("src"));
    }

    Elements heading = page.select(".picmainer h1");
    if (!heading.isEmpty()) {
        pic.setTitle(heading.text());
    }

    Elements tagLinks = page.select(".pleft a");
    if (!tagLinks.isEmpty()) {
        List<String> tagNames = new ArrayList<>();
        for (Element tagLink : tagLinks) {
            tagNames.add(tagLink.text());
        }
        pic.setTags(tagNames);
    }

    urls.add(pic);
    resultMap.put(DetailActivity.parameter.CURRENT_URL, currentUrl);
    resultMap.put(DetailActivity.parameter.RESULT, urls);
    return resultMap;
}
public static List<HtmlCssTheme> popularThemes() { HttpUrl url = HttpUrl.parse(POPULAR_THEMES_URL); Request request = new Request.Builder().url(url).get().build(); String html = Retry.retryUntilSuccessfulWithBackoff( () -> client.newCall(request).execute() ); Elements elements = Jsoup.parse(html).select("script"); Element script = Seq.seq(elements) .filter(e -> { return e.html().startsWith("window.INITIAL_STATE="); }) .findFirst().orElse(null); String rawJson = script.html().substring("window.INITIAL_STATE=".length()); JsonNode node = Json.serializer().nodeFromJson(rawJson); return Seq.seq(node.path("searchPage").path("results").path("matches")) .map(ThemeForestScraper::themeFromElement) .toList(); //.map(ThemeForestScraper::themeFromElement).toList(); }
private void fetchImage() { try { Document doc = Http.url(url).get(); // Find image Elements images = doc.select(".image-container img"); if (images.size() == 0) { logger.warn("Image not found at " + this.url); return; } Element image = images.first(); String imgsrc = image.attr("src"); logger.info("Found URL " + imgsrc); // Provide prefix and let the AbstractRipper "guess" the filename String prefix = ""; if (Utils.getConfigBoolean("download.save_order", true)) { prefix = String.format("%03d_", index); } addURLToDownload(new URL(imgsrc), prefix); } catch (IOException e) { logger.error("[!] Exception while loading/parsing " + this.url, e); } }
/**
 * Fetches the page at the given URL and returns summary text taken from the second
 * child of the element with id {@code intro}.
 *
 * <p>When that child has exactly two child nodes, the text of its first
 * {@code hidden}-class descendant is used; otherwise the child's own text.
 *
 * @param url page to fetch
 * @return the summary, or an empty string when anything fails (the stack trace is
 *         printed)
 */
private String extractContent(String url) {
    try {
        Document page = Jsoup.connect(url).get();
        Element intro = page.getElementById("intro").child(1);
        if (intro.childNodeSize() == 2) {
            return intro.getElementsByClass("hidden").get(0).text();
        }
        return intro.text();
    } catch (Exception e) {
        e.printStackTrace();
        return "";
    }
}
/**
 * Parses the HTML and, for every element matched by the selector
 * {@code user-container}, looks for the first link whose {@code class} attribute is
 * exactly {@code signature}.
 *
 * <p>NOTE(review): the located signature element is never used and nothing is
 * returned or stored, so this method currently has no visible effect. The selector
 * has no leading {@code .} or {@code #}, so it selects by tag name - confirm that
 * is intended.
 *
 * @param url unused here
 * @param html markup to parse
 */
private void processMessageViews(URL url, String html) {
    Document page = Jsoup.parse(html);
    for (Element container : page.select("user-container")) {
        Elements anchors = container.select("a");
        Element signature = new Element("");
        for (Element anchor : anchors) {
            boolean isSignature = anchor.hasAttr("class")
                    && "signature".equals(anchor.attr("class"));
            if (isSignature) {
                signature = anchor;
                break;
            }
        }
    }
}
/**
 * Extracts table rows from the storage body of the given content into key/value maps.
 *
 * <p>The content's string form is parsed as JSON; the HTML under
 * {@code body.storage.value} is scanned for {@code tr} rows. For each row the cells
 * whose {@code colspan} differs from {@code countColumns} are taken; rows with at
 * least five such cells and a non-empty first cell yield a map with the keys
 * {@code dKey}, {@code dValue}, {@code dComment}, {@code dHltValue} and
 * {@code dDevValue}.
 *
 * @param content page content whose {@code toString()} is a JSON document
 * @return one map per data row, in document order
 */
private List<Map<String, String>> getParsedData(Content content){
    // Number of cells read from every data row below.
    final int requiredCells = 5;

    List<Map<String,String >> testData = new ArrayList<>();
    JSONObject jsonObject = new JSONObject(content.toString());
    JSONObject body = (JSONObject) jsonObject.get("body");
    JSONObject storage = (JSONObject) body.get("storage");
    Document doc = Jsoup.parse(storage.get("value").toString());
    Elements tRows = doc.select("tr");
    LOG.info("Парсим данные страницы");
    for (Element row : tRows){
        Elements tds = row.select("td[colspan!="+countColumns+"]");
        // Rows with fewer cells than are read below are skipped instead of
        // failing with IndexOutOfBoundsException.
        if (tds.size() >= requiredCells && !tds.get(0).text().equals("")){
            HashMap<String,String> rowData = new HashMap<>();
            rowData.put("dKey",tds.get(0).text());
            rowData.put("dValue",tds.get(1).text());
            rowData.put("dComment",tds.get(2).text());
            rowData.put("dHltValue",tds.get(3).text());
            rowData.put("dDevValue",tds.get(4).text());
            testData.add(rowData);
        }
        LOG.debug(row.text());
    }
    return testData;
}
/**
 * Builds the course list from the options of the course {@code select} element.
 *
 * <p>Options with an empty {@code value} attribute are skipped. For the others the
 * value becomes the course number, and the option text is passed through
 * {@code HTMLUtil.cutName} and {@code HTMLUtil.cutNameNo}.
 *
 * @param html page markup
 * @return list of {@code Course}; empty when the select element is not found
 */
@Override
public List doAnalysis(String html) {
    List<Course> courses = new ArrayList<Course>();
    Element select = HTMLUtil.getSelectorByName(html,
            Constants.HTML_ELEMENT_NAME.SELECT_NAME_COURSE.getValue());
    if (select == null) {
        return courses;
    }
    for (Element option : select.children()) {
        if (option.attr("value").isEmpty()) {
            continue;
        }
        Course course = new Course();
        course.setName(HTMLUtil.cutName(option.text()));
        course.setNo(option.attr("value"));
        course.setNameNo(HTMLUtil.cutNameNo(option.text()));
        courses.add(course);
    }
    return courses;
}
/**
 * Parses an album page (decoded as utf-8) into one {@code PicInfo} per gallery image.
 *
 * <p>Every {@code .gallary_item .pic_box img} yields an entry whose URL is
 * {@code baseUrl} followed by the image's {@code src}. All entries share the title
 * taken from the first {@code div.album_desc div.inline}, or an empty title when
 * that element is absent.
 *
 * @param baseUrl prefix for the image URLs
 * @param currentUrl stored under {@code CURRENT_URL}
 * @param result raw page bytes
 * @param resultMap map that receives the results
 * @return {@code resultMap}, with {@code CURRENT_URL} and {@code RESULT} filled in
 * @throws UnsupportedEncodingException if the utf-8 charset is unavailable
 */
@Override
public Map<DetailActivity.parameter, Object> getDetailContent(String baseUrl, String currentUrl, byte[] result, Map<DetailActivity.parameter, Object> resultMap) throws UnsupportedEncodingException {
    List<PicInfo> pictures = new ArrayList<>();
    Document page = Jsoup.parse(new String(result, "utf-8"));

    Elements titleBlocks = page.select("div.album_desc div.inline");
    String albumTitle = titleBlocks.size() > 0 ? titleBlocks.get(0).text() : "";

    for (Element image : page.select(".gallary_item .pic_box img")) {
        PicInfo picture = new PicInfo(baseUrl + image.attr("src")).setTitle(albumTitle);
        pictures.add(picture);
    }

    resultMap.put(DetailActivity.parameter.CURRENT_URL, currentUrl);
    resultMap.put(DetailActivity.parameter.RESULT, pictures);
    return resultMap;
}
/**
 * Looks up the team in the second table of the points-allowed page and stores its
 * rank and average on the team.
 *
 * <p>Rows are scanned from index 2; the rank is the 1-based position within the
 * scanned rows. The average is read from column 18 when the third column contains
 * {@code *}, otherwise from column 19. Only the first matching row is used.
 *
 * @param team team to look up by name and to update
 */
public void populatePointsGivenToRB(Team team) {
    Element table = rbPointsAllowedURL.select("table").get(1);
    Elements rows = table.select("tr");
    for (int i = 2; i < rows.size(); i++) {
        Elements cols = rows.get(i).select("td");
        if (!cols.get(0).text().contains(team.getName())) {
            continue;
        }
        int averageColumn = cols.get(2).text().contains("*") ? 18 : 19;
        // Scanning starts at row index 2, so that row has rank 1.
        team.setFpToRBRank(i - 1);
        team.setFpToRBAvg(Double.parseDouble(cols.get(averageColumn).text()));
        break;
    }
}
/**
 * Builds models from all {@code div.r} elements of the document except the last one.
 *
 * <p>The title is the text of the element's links that carry a {@code class}
 * attribute; the URL is the {@code href} of the first link without class
 * {@code link}.
 *
 * @return the models in document order
 */
public List<MagneticModel> getList() {
    List<MagneticModel> models = new ArrayList<>();
    Elements blocks = document.select("div.r");
    // The final div.r is deliberately left out.
    int lastIndex = blocks.size() - 1;
    for (int i = 0; i < lastIndex; i++) {
        Element block = blocks.get(i);
        MagneticModel model = new MagneticModel();
        model.title = block.select("a[class]").text();
        model.url = block.select("a:not(.link)").attr("href");
        models.add(model);
    }
    return models;
}
/**
 * Appends the text below the element to the accumulator, depth first, skipping
 * every child node for which {@code unlikely} returns true.
 *
 * <p>Before descending into a child element a single space is appended when the
 * accumulator is non-empty, the element is a block and the accumulator does not
 * already end in whitespace, or otherwise when the element is a {@code br}.
 *
 * @param e element whose children are visited
 * @param accum receives the text
 * @param indent recursion depth; only passed on to the recursive call
 */
private void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
    for (Node node : e.childNodes()) {
        if (unlikely(node)) {
            continue;
        }
        if (node instanceof TextNode) {
            accum.append(((TextNode) node).text());
            continue;
        }
        if (!(node instanceof Element)) {
            continue;
        }
        Element nested = (Element) node;
        boolean blockNeedsSeparator =
                accum.length() > 0 && nested.isBlock() && !lastCharIsWhitespace(accum);
        if (blockNeedsSeparator || nested.tagName().equals("br")) {
            accum.append(' ');
        }
        appendTextSkipHidden(nested, accum, indent + 1);
    }
}
/**
 * Writes this component's state to the design element, after the superclass has
 * written its part.
 *
 * <p>Attributes written: {@code time-format} ({@code 12h} or {@code 24h}) when a
 * time format is set, {@code start-date} and {@code end-date} when the respective
 * date is set, and {@code time-zone} when the zone differs from the system default.
 *
 * @param design element receiving the attributes
 * @param designContext passed on to the superclass
 */
@Override
public void writeDesign(Element design, DesignContext designContext) {
    super.writeDesign(design, designContext);

    if (currentTimeFormat != null) {
        String formatName;
        if (currentTimeFormat == TimeFormat.Format12H) {
            formatName = "12h";
        } else {
            formatName = "24h";
        }
        design.attr("time-format", formatName);
    }

    if (startDate != null) {
        String formattedStart = DATE_FORMAT.format(getStartDate());
        design.attr("start-date", formattedStart);
    }

    if (endDate != null) {
        String formattedEnd = DATE_FORMAT.format(getEndDate());
        design.attr("end-date", formattedEnd);
    }

    boolean usesSystemZone = getZoneId().equals(ZoneId.systemDefault());
    if (!usesSystemZone) {
        design.attr("time-zone", getZoneId().getId());
    }
}
/**
 * Collects the absolute link targets of the thumbnails on the page.
 *
 * <p>Considers the {@code a} children of {@code .shm-thumb.thumb} elements, leaving
 * out links with class {@code shm-thumb-link}.
 *
 * @param page parsed page
 * @return absolute {@code href} values in document order
 */
@Override
public List<String> getURLsFromPage(Document page) {
    Elements thumbLinks = page.select(".shm-thumb.thumb>a").not(".shm-thumb-link");
    List<String> urls = new ArrayList<>(thumbLinks.size());
    for (Element link : thumbLinks) {
        String absolute = link.absUrl("href");
        urls.add(absolute);
    }
    return urls;
}
/**
 * Returns the inner HTML of every element in the context.
 *
 * @param context elements to read; may be null or empty
 * @return one node per element, wrapping that element's inner HTML; empty when the
 *         context is null or empty
 */
public List<JXNode> html(Elements context) {
    List<JXNode> nodes = new LinkedList<JXNode>();
    if (context == null || context.size() == 0) {
        return nodes;
    }
    for (Element element : context) {
        String innerHtml = element.html();
        nodes.add(JXNode.t(innerHtml));
    }
    return nodes;
}
/**
 * Scores an element from its class name, id, inline style and {@code itemprop}.
 *
 * <p>Adjustments: +35 / +45 when class name / id match {@code POSITIVE}; -20 each
 * when they match {@code UNLIKELY}; -50 each when they match {@code NEGATIVE};
 * -50 when a non-empty style matches {@code NEGATIVE_STYLE}; +100 when a non-empty
 * itemprop matches {@code POSITIVE}.
 *
 * @param e element to score
 * @return the summed weight
 */
private int calcWeight(Element e) {
    String className = e.className();
    String id = e.id();
    int score = 0;

    if (POSITIVE.matcher(className).find()) {
        score += 35;
    }
    if (POSITIVE.matcher(id).find()) {
        score += 45;
    }

    if (UNLIKELY.matcher(className).find()) {
        score -= 20;
    }
    if (UNLIKELY.matcher(id).find()) {
        score -= 20;
    }

    if (NEGATIVE.matcher(className).find()) {
        score -= 50;
    }
    if (NEGATIVE.matcher(id).find()) {
        score -= 50;
    }

    String inlineStyle = e.attr("style");
    boolean hasStyle = inlineStyle != null && !inlineStyle.isEmpty();
    if (hasStyle && NEGATIVE_STYLE.matcher(inlineStyle).find()) {
        score -= 50;
    }

    String itemProperty = e.attr("itemprop");
    boolean hasItemProperty = itemProperty != null && !itemProperty.isEmpty();
    if (hasItemProperty && POSITIVE.matcher(itemProperty).find()) {
        score += 100;
    }
    return score;
}
/**
 * Collects the gallery thumbnail links of the page as absolute URLs.
 *
 * <p>The {@code href} of every {@code .gallerythumb} element is appended to the
 * site root.
 *
 * @param page parsed page
 * @return the URLs in document order
 */
@Override
public List<String> getURLsFromPage(Document page) {
    List<String> urls = new ArrayList<>();
    for (Element thumb : page.select(".gallerythumb")) {
        urls.add("https://nhentai.net" + thumb.attr("href"));
    }
    return urls;
}
/**
 * Parses proxies from the rows of the table inside {@code div#list}.
 *
 * <p>In each row the first child holds the IP and the second the port.
 *
 * @param content page markup
 * @return one proxy per table row, in document order
 */
@Override
public List<Proxy> parseProxys(String content) {
    List<Proxy> proxies = new ArrayList<>();
    Elements rows = Jsoup.parse(content).select("div#list table tbody tr");
    for (Element row : rows) {
        Elements cells = row.children();
        String ip = cells.get(0).text().trim();
        Integer port = Integer.parseInt(cells.get(1).text());
        proxies.add(new Proxy(ip, port));
    }
    return proxies;
}
/** * 推荐阅读 * * @param str */ public List<IHistoryOldPhoto> takeProposeRead(String str) { Document document = Jsoup.parse(str); List<IHistoryOldPhoto> proposeReads = new ArrayList<>(); Elements elements = document.getElementsByClass("oldpic"); Elements element = elements.get(0).getElementsByTag("li"); for (Element element2 : element) { IHistoryOldPhoto proposeRead = new IHistoryOldPhoto(); proposeRead.setTitle(element2.getElementsByTag("img").attr("title"));//title proposeRead.setHref(AppUtils.Constants.URL_ILISHI + element2.getElementsByTag("a").attr("href"));//href proposeRead.setImgHref(AppUtils.Constants.URL_ILISHI + element2.getElementsByTag("img").attr("src"));//imgHref proposeReads.add(proposeRead); } return proposeReads; }
/**
 * Returns the first element matched by {@code selectorText} below the given element.
 *
 * @param element element to search in
 * @return the first match, or null when {@code Preconditions.isNotBlank} rejects
 *         the selection result
 */
@Override
public Element selectElement(Element element) {
    Elements matched = element.select(selectorText);
    return Preconditions.isNotBlank(matched) ? matched.get(0) : null;
}
/**
 * Downloads the document addressed by the item's id and returns its body element.
 *
 * @param item item whose id (in string form) is used as the URL to fetch
 * @return the body of the fetched document, or null when fetching fails
 */
private static Element getSeItemContentBody(SeItem item) {
    try {
        Document doc = Jsoup.connect(item.getId().toString()).get();
        return doc.body();
    } catch (IOException e) {
        // Best effort: the caller gets null. The exception is passed as the last
        // argument so the cause of the failure is not lost from the log.
        LOG.debug("Failed to parse se-item '{}'", item.getName(), e);
        return null;
    }
}
/**
 * Groups the paragraphs of the content element into pronunciation entries.
 *
 * <p>A paragraph whose first child is a {@code span} with class {@code dicpy} starts
 * a new entry, with the span text as pronunciation. A paragraph whose first child is
 * an {@code em} adds one meaning to the current entry, built from the text of all
 * sibling nodes following the {@code em}. Paragraphs without child elements are
 * ignored.
 *
 * @param contentEL element containing the {@code p} paragraphs
 * @return the entries in document order
 */
private List<DuYinDM> gatherDuyins(Element contentEL) throws Exception {
    List<DuYinDM> results = new ArrayList<DuYinDM>(3);
    DuYinDM dm = null;

    for (Element p : contentEL.select("p")) {
        if (p.children().isEmpty()) {
            continue;
        }
        Element firstChild = p.child(0);

        if ("span".equals(firstChild.tagName())) {
            if (firstChild.hasClass("dicpy")) {
                // A new pronunciation closes the entry collected so far.
                if (dm != null) {
                    results.add(dm);
                }
                dm = new DuYinDM();
                dm.setDuyin(firstChild.text());
            }
        } else if ("em".equals(firstChild.tagName())) {
            if (dm == null) {
                // A meaning before any pronunciation has no entry to attach to;
                // skip it instead of failing with a NullPointerException.
                continue;
            }
            StringBuilder ziyi = new StringBuilder();
            for (Node next = firstChild.nextSibling(); next != null; next = next.nextSibling()) {
                if (next instanceof TextNode) {
                    ziyi.append(((TextNode) next).text());
                } else if (next instanceof Element) {
                    ziyi.append(((Element) next).text());
                }
            }
            dm.addZiyi(ziyi.toString());
        }
    }

    if (dm != null) {
        results.add(dm);
    }
    return results;
}
/**
 * Derives image URLs from the thumbnails on the page.
 *
 * <p>Each thumbnail {@code src} is prefixed with {@code https:}; the host
 * {@code hentaicdn.com} is replaced by {@code static.hentaicdn.com}, and the
 * fragments {@code thumbnails/} and {@code tmb} are removed, in that order.
 *
 * @param doc parsed page
 * @return the rewritten URLs in document order
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    List<String> urls = new ArrayList<String>();
    Elements thumbnails =
            doc.select("div.block-content > div > div.img-container > a > img.img-responsive");
    for (Element thumbnail : thumbnails) {
        String fullSize = ("https:" + thumbnail.attr("src"))
                .replace("hentaicdn.com", "static.hentaicdn.com")
                .replace("thumbnails/", "")
                .replace("tmb", "");
        urls.add(fullSize);
    }
    return urls;
}
/**
 * Returns all element siblings that follow the given element (the
 * {@code following-sibling} axis).
 *
 * @param e element whose following siblings are wanted
 * @return the following sibling elements in document order; empty when there are none
 */
public Elements followingSibling(Element e) {
    Elements siblings = new Elements();
    for (Element next = e.nextElementSibling(); next != null; next = next.nextElementSibling()) {
        siblings.add(next);
    }
    return siblings;
}
/**
 * Returns the first element matching the query inside the container.
 *
 * @param container element to search in
 * @param query selector to evaluate
 * @return the first match
 * @throws ElementNotFoundException if nothing matches the query
 */
private static Element element(Element container, String query) {
    Elements matches = container.select(query);
    if (matches.isEmpty()) {
        throw new ElementNotFoundException(query);
    }
    return matches.get(0);
}
/**
 * Tests whether any ancestor of the element, up to and including the root, matches
 * the wrapped evaluator.
 *
 * @param root upper bound of the ancestor walk
 * @param element element whose ancestors are tested
 * @return true if an ancestor matches; false if none does, if the element is the
 *         root itself, or if the top of the tree is reached without meeting the root
 */
public boolean matches(Element root, Element element) {
    if (root == element) {
        return false;
    }
    // Stop at the top of the tree: when root is not an ancestor of element (or the
    // element is detached), parent() eventually returns null and must not be
    // handed to the evaluator or dereferenced.
    for (Element parent = element.parent(); parent != null; parent = parent.parent()) {
        if (evaluator.matches(root, parent)) {
            return true;
        }
        if (parent == root) {
            break;
        }
    }
    return false;
}
private static String GetDivContent(Element div) { StringBuilder sb = new StringBuilder(); //考虑div里标签内容的顺序,对div子树进行深度优先搜索 Stack<Element> sk = new Stack<Element>(); sk.push(div); while (!sk.empty()) { // Element e = sk.pop(); //对于div中的div过滤掉 if (e != div && e.tagName().equals("div")) continue; //考虑正文被包含在p标签中的情况,并且p标签里不能含有a标签 if (e.tagName().equals("p") && e.getElementsByTag("a").size() == 0) { String className = e.className(); if (className.length() != 0 && className.equals("pictext")) continue; sb.append(e.text()); sb.append("\n"); continue; } else if (e.tagName().equals("td")) { //考虑正文被包含在td标签中的情况 if (e.getElementsByTag("div").size() != 0) continue; sb.append(e.text()); sb.append("\n"); continue; } //将孩子节点加入栈中 Elements children = e.children(); for (int i = children.size() - 1; i >= 0; i--) { sk.push((Element) children.get(i)); } } return sb.toString(); }