public void configure(JobConf job) {
  // 'key' == sortInput for sort-input; key == sortOutput for sort-output
  key = deduceInputFile(job);

  if (key == sortOutput) {
    partitioner = new HashPartitioner<WritableComparable, Writable>();

    // Figure the 'current' partition and no. of reduces of the 'sort'
    try {
      URI inputURI = new URI(job.get(JobContext.MAP_INPUT_FILE));
      String inputFile = inputURI.getPath();
      // part file is of the form part-r-xxxxx
      partition = Integer.valueOf(inputFile.substring(
          inputFile.lastIndexOf("part") + 7)).intValue();
      noSortReducers = job.getInt(SORT_REDUCES, -1);
    } catch (Exception e) {
      System.err.println("Caught: " + e);
      System.exit(-1);
    }
  }
}
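// A hedged sketch (checkPartition is a hypothetical helper, not verbatim
// SortValidator code) of how the fields configured above are typically
// consumed: each key read back from the sort output is re-partitioned with
// the same HashPartitioner, and the result must match the partition deduced
// from the part-file name, otherwise the sort output is inconsistent.
private void checkPartition(WritableComparable key, Writable value)
    throws IOException {
  int computed = partitioner.getPartition(key, value, noSortReducers);
  if (computed != partition) {
    throw new IOException("Partitions do not match for record " + key
        + ": computed " + computed + ", expected " + partition);
  }
}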
/**
 * Prints the content of the Node represented by the url to system out.
 *
 * @param webGraphDb
 *          The webgraph from which to get the node.
 * @param url
 *          The url of the node.
 *
 * @throws IOException
 *           If an error occurs while getting the node.
 */
public void dumpUrl(Path webGraphDb, String url) throws IOException {
  fs = FileSystem.get(getConf());
  nodeReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb,
      WebGraph.NODE_DIR), getConf());

  // open the readers, get the node, print out the info, and close the readers
  Text key = new Text(url);
  Node node = new Node();
  MapFileOutputFormat.getEntry(nodeReaders,
      new HashPartitioner<Text, Node>(), key, node);
  System.out.println(url + ":");
  System.out.println("  inlink score: " + node.getInlinkScore());
  System.out.println("  outlink score: " + node.getOutlinkScore());
  System.out.println("  num inlinks: " + node.getNumInlinks());
  System.out.println("  num outlinks: " + node.getNumOutlinks());
  FSUtils.closeReaders(nodeReaders);
}
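// For context: MapFileOutputFormat.getEntry() routes a lookup by hashing the
// key with the supplied partitioner to select the single part file that can
// contain it, then does one MapFile lookup there. A simplified sketch of
// that routing logic (adapted from the Hadoop implementation, not verbatim):
public static <K extends WritableComparable, V extends Writable> Writable
    getEntry(MapFile.Reader[] readers, Partitioner<K, V> partitioner,
        K key, V value) throws IOException {
  int part = partitioner.getPartition(key, value, readers.length);
  return readers[part].get(key, value);
}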
public void configure(JobConf job) {
  // 'key' == sortInput for sort-input; key == sortOutput for sort-output
  key = deduceInputFile(job);

  if (key == sortOutput) {
    partitioner = new HashPartitioner<WritableComparable, Writable>();

    // Figure the 'current' partition and no. of reduces of the 'sort'
    try {
      URI inputURI = new URI(job.get("map.input.file"));
      String inputFile = inputURI.getPath();
      // part file is of the (older) form part-xxxxx
      partition = Integer.valueOf(
          inputFile.substring(inputFile.lastIndexOf("part") + 5)).intValue();
      noSortReducers = job.getInt("sortvalidate.sort.reduce.tasks", -1);
    } catch (Exception e) {
      System.err.println("Caught: " + e);
      System.exit(-1);
    }
  }
}
/**
 * Prints the loopset for a single url. The loopset information will show any
 * outlink url that eventually forms a link cycle.
 *
 * @param webGraphDb
 *          The WebGraph to check for loops.
 * @param url
 *          The url to check.
 *
 * @throws IOException
 *           If an error occurs while printing loopset information.
 */
public void dumpUrl(Path webGraphDb, String url) throws IOException {

  // open the readers
  fs = FileSystem.get(getConf());
  loopReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb,
      Loops.LOOPS_DIR), getConf());

  // get the loopset for a given url, if any
  Text key = new Text(url);
  LoopSet loop = new LoopSet();
  MapFileOutputFormat.getEntry(loopReaders,
      new HashPartitioner<Text, LoopSet>(), key, loop);

  // print out each loop url in the set
  System.out.println(url + ":");
  for (String loopUrl : loop.getLoopSet()) {
    System.out.println("  " + loopUrl);
  }

  // close the readers
  FSUtils.closeReaders(loopReaders);
}
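// Hypothetical invocation (the LoopReader class name, webgraphdb path, and
// url are illustrative assumptions): dump the loop cycle, if any, recorded
// for a single url.
LoopReader reader = new LoopReader();
reader.setConf(NutchConfiguration.create());
reader.dumpUrl(new Path("crawl/webgraphdb"), "http://example.com/");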
public CrawlDatum get(String crawlDb, String url, JobConf config)
    throws IOException {
  Text key = new Text(url);
  CrawlDatum val = new CrawlDatum();
  openReaders(crawlDb, config);
  CrawlDatum res = (CrawlDatum) MapFileOutputFormat.getEntry(readers,
      new HashPartitioner<Text, CrawlDatum>(), key, val);
  return res;
}
public static void main(String[] args) throws Exception {

  if (args == null || args.length < 2) {
    System.out.println("LinkDumper$Reader usage: <webgraphdb> <url>");
    return;
  }

  // open the readers for the linkdump directory
  Configuration conf = NutchConfiguration.create();
  FileSystem fs = FileSystem.get(conf);
  Path webGraphDb = new Path(args[0]);
  String url = args[1];
  MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(
      webGraphDb, DUMP_DIR), conf);

  // get the link nodes for the url
  Text key = new Text(url);
  LinkNodes nodes = new LinkNodes();
  MapFileOutputFormat.getEntry(readers,
      new HashPartitioner<Text, LinkNodes>(), key, nodes);

  // print out the link nodes
  LinkNode[] linkNodesAr = nodes.getLinks();
  System.out.println(url + ":");
  for (LinkNode node : linkNodesAr) {
    System.out.println("  " + node.getUrl() + " - "
        + node.getNode().toString());
  }

  // close the readers
  FSUtils.closeReaders(readers);
}
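// Hypothetical programmatic invocation of the inner Reader class above (the
// webgraphdb path and url are made-up examples):
LinkDumper.Reader.main(new String[] { "crawl/webgraphdb",
    "http://example.com/" });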
public CrawlDatum get(String crawlDb, String url, Configuration config)
    throws IOException {
  Text key = new Text(url);
  CrawlDatum val = new CrawlDatum();
  openReaders(crawlDb, config);
  CrawlDatum res = (CrawlDatum) MapFileOutputFormat.getEntry(readers,
      new HashPartitioner<Text, CrawlDatum>(), key, val);
  return res;
}
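// Hypothetical usage, assuming the method above lives on Nutch's
// CrawlDbReader (the path and url are made-up examples): look up the stored
// CrawlDatum for one url and print it.
CrawlDbReader dbReader = new CrawlDbReader();
CrawlDatum datum = dbReader.get("crawl/crawldb", "http://example.com/",
    NutchConfiguration.create());
System.out.println(datum);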
/**
 * Get the user's original partitioner.
 *
 * @param conf the configuration to look in
 * @return the class that the user submitted
 */
static Class<? extends Partitioner> getJavaPartitioner(JobConf conf) {
  return conf.getClass(Submitter.PARTITIONER, HashPartitioner.class,
      Partitioner.class);
}
/**
 * Get the user's original partitioner.
 *
 * @param conf the configuration to look in
 * @return the class that the user submitted
 */
static Class<? extends Partitioner> getJavaPartitioner(JobConf conf) {
  return conf.getClass("hadoop.pipes.partitioner", HashPartitioner.class,
      Partitioner.class);
}
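// Counterpart sketch: the lookup above only returns something other than the
// HashPartitioner default if a partitioner class was registered under the
// same key beforehand (MyPartitioner is a hypothetical user class):
conf.setClass("hadoop.pipes.partitioner", MyPartitioner.class,
    Partitioner.class);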