public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach(splits -> {
        Path path = splits._1.getPath();
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
        FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);

        writeInterleavedSplits(fqreader, fqreader2, new Configuration(),
                splitDir + "/" + path.getParent().getName() + "_" + splits._1.getStart() + ".fq");
    });
}
public static void main(String[] args) throws InterruptedException {
    SparkConf sc = new SparkConf().setAppName("POC-Streaming");
    try (JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(2000))) {
        //JavaDStream<SampleXML> records = jsc.textFileStream("input/").map(new ParseXML());
        //textFileStream processes files line by line, so each XML document would have to fit on one line to work; alternative below
        JavaRDD<String> files = jsc.sparkContext().wholeTextFiles("input/").map(tuple -> tuple._2());
        Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
        rddQueue.add(files);

        JavaDStream<String> records = jsc.queueStream(rddQueue);
        records.foreachRDD(rdd -> System.out.printf("Amount of XMLs: %d\n", rdd.count()));

        jsc.start();
        jsc.awaitTermination();
    }
}
/**
 * @param sparkContext    active Spark Context
 * @param trainData       training data on which to build a model
 * @param hyperParameters ordered list of hyper parameter values to use in building model
 * @param candidatePath   directory where additional model files can be written
 * @return a {@link PMML} representation of a model trained on the given data
 */
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
    int numClusters = (Integer) hyperParameters.get(0);
    Preconditions.checkArgument(numClusters > 1);
    log.info("Building KMeans Model with {} clusters", numClusters);

    JavaRDD<Vector> trainingData = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));
    KMeansModel kMeansModel = KMeans.train(trainingData.rdd(), numClusters, maxIterations,
            numberOfRuns, initializationStrategy);

    return kMeansModelToPMML(kMeansModel, fetchClusterCountsFromModel(trainingData, kMeansModel));
}
public TrainedModel findBestModel(JavaRDD<Rating> ratings) {
    double[] weights = {6, 2, 2};
    JavaRDD<Rating>[] randomRatings = ratings.randomSplit(weights, 0L);
    JavaRDD<Rating> trainingRdd = randomRatings[0];
    JavaRDD<Rating> validationRdd = randomRatings[1];
    JavaRDD<Rating> testRdd = randomRatings[2];

    TrainConfig trainConfig = findBestTrainingParameters(trainingRdd, validationRdd);
    TrainedModel model = ModelFactory.create(trainingRdd, testRdd,
            trainConfig.getRankNr(), trainConfig.getIterationsNr());
    logger.info("best model has RMSE = " + model.getError());
    return model;
}
public JavaPairRDD<Integer, IntArrayList> parseBlockCollection(JavaRDD<String> blockingInput) {
    System.out.println("Parsing the blocking collection...");
    return blockingInput
        .map(line -> line.split("\t"))      //split to [blockId, [entityIds]]
        .filter(line -> line.length == 2)   //only keep lines of this format
        .mapToPair(pair -> {
            int blockId = Integer.parseInt(pair[0]);
            String[] entities = pair[1].replaceFirst(";", "").split("#");
            if (entities == null || entities.length == 0) {
                return null;
            }
            //possible (but not really probable) cause of OOM (memory errors) if huge blocks exist
            List<Integer> outputEntities = new ArrayList<>();
            for (String entity : entities) {
                if (entity.isEmpty()) {
                    continue; //in case the last entityId finishes with '#'
                }
                Integer entityId = Integer.parseInt(entity);
                outputEntities.add(entityId);
            }
            return new Tuple2<>(blockId, new IntArrayList(outputEntities.stream().mapToInt(i -> i).toArray()));
        })
        .filter(x -> x != null);
}
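A minimal usage sketch for parseBlockCollection, called as if from within its enclosing class; the local master, the hard-coded input lines, and the variable names are illustrative only. Each line follows the blockId<TAB>entityId#entityId#... layout the method expects.

JavaSparkContext jsc = new JavaSparkContext(new SparkConf().setAppName("block-parsing").setMaster("local[*]"));
JavaRDD<String> blockingInput = jsc.parallelize(Arrays.asList(
        "1\t3#7#12#",
        "2\t7#25#"));
JavaPairRDD<Integer, IntArrayList> blocks = parseBlockCollection(blockingInput);
blocks.collect().forEach(b -> System.out.println(b._1() + " -> " + b._2())); // e.g. 1 -> [3, 7, 12]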
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    Path fqpath = new Path(fqPath);
    String fqname = fqpath.getName();
    String[] ns = fqname.split("\\.");

    //TODO: Handle also compressed files
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    splitRDD.foreach(split -> {
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
        writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
    });
}
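A driver sketch for the splitter above, assuming the FASTQ file is reachable through the Hadoop FileSystem API and that this code runs inside the same class; the paths, the app name, and the 400000-line split size are placeholders.

Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path fq = new Path("reads_1.fastq");                        // hypothetical input
FileStatus fst = fs.getFileStatus(fq);

JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("fastq-splitter"));
splitFastq(fst, fq.toString(), "fastq_splits", 400000, sc); // 400000 lines = 100000 FASTQ records per split
sc.close();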
private static JavaRDD<String> getUniqueKmers(JavaPairRDD<Text, SequencedFragment> fastqRDD, int k) {
    JavaRDD<String> rdd = fastqRDD.mapPartitions(records -> {
        HashSet<String> umer_set = new HashSet<String>();
        while (records.hasNext()) {
            Tuple2<Text, SequencedFragment> fastq = records.next();
            String seq = fastq._2.getSequence().toString();
            //HashSet<String> umer_in_seq = new HashSet<String>();
            //collect every k-length substring of the read (i <= length - k covers the last k-mer)
            for (int i = 0; i <= seq.length() - k; i++) {
                String kmer = seq.substring(i, i + k);
                umer_set.add(kmer);
            }
        }
        return umer_set.iterator();
    });

    JavaRDD<String> umersRDD = rdd.distinct();
    //umersRDD.sortBy(s -> s, true, 4);
    return umersRDD;
}
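One way this could be driven, assuming the reads are loaded with Hadoop-BAM's FastqInputFormat and the call happens inside the same class; the input format choice, the file name, and k = 21 are assumptions, not part of the snippet above.

JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(
        "reads.fastq", FastqInputFormat.class, Text.class, SequencedFragment.class,
        sc.hadoopConfiguration());
JavaRDD<String> kmers = getUniqueKmers(fastqRDD, 21);
System.out.println("distinct 21-mers: " + kmers.count());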
private static JavaPairRDD<Text, SequencedFragment> mapSAMRecordsToFastq(JavaRDD<SAMRecord> bamRDD) {
    JavaPairRDD<Text, SequencedFragment> fastqRDD = bamRDD.mapToPair(read -> {
        String name = read.getReadName();
        if (read.getReadPairedFlag()) {
            if (read.getFirstOfPairFlag())
                name = name + "/1";
            if (read.getSecondOfPairFlag())
                name = name + "/2";
        }

        //TODO: check values
        Text t = new Text(name);
        SequencedFragment sf = new SequencedFragment();
        sf.setSequence(new Text(read.getReadString()));
        sf.setQuality(new Text(read.getBaseQualityString()));

        return new Tuple2<Text, SequencedFragment>(t, sf);
    });
    return fastqRDD;
}
@Test
public void testFilterWithoutIgnoreTypes() {
    InstanceFilterConfig config = new InstanceFilterConfig()
            .setIgnoreOtherTypes(false)
            .setTypes(Arrays.asList("http://example.com/a/type", "http://example.com/b/type"));
    InstanceFilter filter = new InstanceFilter(config);
    JavaRDD<Instance> result = filter.filter(testRDD, typeIndex);

    List<Instance> rdd = new ArrayList<>();

    Instance one = new Instance();
    one.addType(typeIndex.getIndex("http://example.com/a/type"));
    one.setUri("http://example.com/one");
    rdd.add(one);

    Instance two = new Instance();
    two.addType(typeIndex.getIndex("http://example.com/b/type"));
    two.addType(typeIndex.getIndex("http://example.com/c/type"));
    two.setUri("http://example.com/two");
    rdd.add(two);

    JavaRDD<Instance> expected = jsc().parallelize(rdd);

    assertRDDEquals("Expected instances without restricted types were filtered out", expected, result);
}
/**
 * Maps an entity url to its entity id, that is also used by blocking.
 * @deprecated use {@link #readEntityIdsMapping(JavaRDD)} instead, to get the entity mappings used in blocking
 * @param rawTriples
 * @param SEPARATOR
 * @return a map from an entity url to its entity id, that is also used by blocking.
 */
public static Object2IntOpenHashMap<String> getEntityIdsMapping(JavaRDD<String> rawTriples, String SEPARATOR) {
    LinkedHashSet<String> subjectsSet = new LinkedHashSet<>(rawTriples
            .map(line -> line.split(SEPARATOR)[0])
            .collect()); //convert list to set (to remove duplicates)

    Object2IntOpenHashMap<String> result = new Object2IntOpenHashMap<>(subjectsSet.size());
    result.defaultReturnValue(-1);
    int index = 0;
    for (String subject : subjectsSet) {
        result.put(subject, index++);
    }
    return result;
}
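A usage sketch under the assumption that the triples file uses a single space as SEPARATOR; the file name and the lookup key are made up.

JavaRDD<String> rawTriples = jsc.textFile("triples.nt");                   // hypothetical input
Object2IntOpenHashMap<String> entityIds = getEntityIdsMapping(rawTriples, " ");
int id = entityIds.getInt("<http://example.com/entity1>");                 // -1 when the subject is unknown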
/**
 * Returns the latest versions of a given set of concept maps.
 *
 * @param urls a set of URLs to retrieve the latest version for, or null to load them all.
 * @param includeExperimental flag to include concept maps marked as experimental
 *
 * @return a map of concept map URLs to the latest version for them.
 */
public Map<String, String> getLatestVersions(final Set<String> urls, boolean includeExperimental) {

    // Reduce by the concept map URI to return only the latest version
    // per concept map. Spark's provided max aggregation function
    // only works on numeric types, so we jump into RDDs and perform
    // the reduce by hand.
    JavaRDD<UrlAndVersion> changes = this.conceptMaps.select(col("url"), col("version"), col("experimental"))
            .toJavaRDD()
            .filter(row -> (urls == null || urls.contains(row.getString(0)))
                    && (includeExperimental || row.isNullAt(2) || !row.getBoolean(2)))
            .mapToPair(row -> new Tuple2<>(row.getString(0), row.getString(1)))
            .reduceByKey((leftVersion, rightVersion) ->
                    leftVersion.compareTo(rightVersion) > 0 ? leftVersion : rightVersion)
            .map(tuple -> new UrlAndVersion(tuple._1, tuple._2));

    return this.spark.createDataset(changes.rdd(), URL_AND_VERSION_ENCODER)
            .collectAsList()
            .stream()
            .collect(Collectors.toMap(UrlAndVersion::getUrl, UrlAndVersion::getVersion));
}
/**
 * @param evalData data for evaluation
 * @return the Dunn Index of a given clustering
 *  (https://en.wikipedia.org/wiki/Cluster_analysis#Internal_evaluation); higher is better
 */
@Override
double evaluate(JavaRDD<Vector> evalData) {
    // Intra-cluster distance is mean distance to centroid
    double maxIntraClusterDistance =
            fetchClusterMetrics(evalData).values().mapToDouble(ClusterMetric::getMeanDist).max();

    // Inter-cluster distance is distance between centroids
    double minInterClusterDistance = Double.POSITIVE_INFINITY;
    List<ClusterInfo> clusters = new ArrayList<>(getClustersByID().values());
    DistanceFn<double[]> distanceFn = getDistanceFn();
    for (int i = 0; i < clusters.size(); i++) {
        double[] centerI = clusters.get(i).getCenter();
        // Distances are symmetric, hence d(i,j) == d(j,i)
        for (int j = i + 1; j < clusters.size(); j++) {
            double[] centerJ = clusters.get(j).getCenter();
            minInterClusterDistance =
                    Math.min(minInterClusterDistance, distanceFn.applyAsDouble(centerI, centerJ));
        }
    }

    return minInterClusterDistance / maxIntraClusterDistance;
}
private static JavaRDD<SAMRecord> alignmentsToSAM(JavaRDD<String> alignmentRDD, SAMFileHeader header) {
    return alignmentRDD.mapPartitions(alns -> {
        List<SAMRecord> records = new ArrayList<SAMRecord>();
        final SAMLineParser samLP = new SAMLineParser(new DefaultSAMRecordFactory(),
                ValidationStringency.SILENT, header, null, null);
        while (alns.hasNext()) {
            String aln = alns.next().replace("\r\n", "").replace("\n", "").replace(System.lineSeparator(), "");
            try {
                SAMRecord record = samLP.parseLine(aln);
                records.add(record);
            } catch (SAMFormatException e) {
                System.out.println(e.getMessage());
            }
        }
        return records.iterator();
    });
}
public static JavaPairRDD<SAMRecord, SAMRecordWritable> readsToWritable(JavaRDD<SAMRecord> records, Broadcast<SAMFileHeader> header) {
    return records.mapToPair(read -> {
        //SEQUENCE DICTIONARY must be set here for the alignment because it's not given as header file
        //Set in alignment to sam map phase
        if (header.getValue().getSequenceDictionary() == null)
            header.getValue().setSequenceDictionary(new SAMSequenceDictionary());
        if (header.getValue().getSequenceDictionary().getSequence(read.getReferenceName()) == null)
            header.getValue().getSequenceDictionary().addSequence(new SAMSequenceRecord(read.getReferenceName()));

        //read.setHeader(read.getHeader());
        read.setHeaderStrict(header.getValue());
        final SAMRecordWritable samRecordWritable = new SAMRecordWritable();
        samRecordWritable.set(read);
        return new Tuple2<>(read, samRecordWritable);
    });
}
/**
 * Return an RDD with keys: label objects, and values: entity ids from a single collection, having this label
 * @param inputTriples
 * @param labelAtts
 * @param entityIds
 * @param SEPARATOR
 * @param positiveIds
 * @return
 */
private JavaPairRDD<String, Integer> getLabelBlocks(JavaRDD<String> inputTriples, Set<String> labelAtts,
        JavaRDD<String> entityIds, String SEPARATOR, boolean positiveIds) {
    Object2IntOpenHashMap<String> urls1 = Utils.readEntityIdsMapping(entityIds, positiveIds);
    return inputTriples.mapToPair(line -> {
        String[] spo = line.toLowerCase().replaceAll(" \\.$", "").split(SEPARATOR); //lose the ending " ." from valid .nt files
        if (spo.length < 3) {
            return null;
        }
        if (labelAtts.contains(spo[1])) {
            String labelValue = line.substring(line.indexOf(spo[1]) + spo[1].length() + SEPARATOR.length())
                    .toLowerCase().replaceAll("[^a-z0-9 ]", "").trim();
            int subjectId = urls1.getInt(Utils.encodeURIinUTF8(spo[0])); //replace subject url with entity id
            if (!positiveIds) {
                subjectId = -subjectId;
            }
            return new Tuple2<String, Integer>(labelValue, subjectId);
        } else {
            return null;
        }
    })
    .filter(x -> x != null)
    .distinct();
}
public static void main(String[] args) throws Exception {
    System.out.println(System.getProperty("hadoop.home.dir"));
    String inputPath = args[0];
    String outputPath = args[1];
    FileUtils.deleteQuietly(new File(outputPath));

    JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");

    JavaRDD<String> rdd = sc.textFile(inputPath);
    JavaPairRDD<String, Integer> counts = rdd
            .flatMap(x -> Arrays.asList(x.split(" ")).iterator())
            .mapToPair(x -> new Tuple2<String, Integer>(x, 1))
            .reduceByKey((x, y) -> x + y);
    counts.saveAsTextFile(outputPath);
    sc.close();
}
/**
 * Implementation which splits based solely on time. It will return approximately
 * the earliest {@link #getTestFraction()} of input, ordered by timestamp, as new training
 * data and the rest as test data.
 */
@Override
protected Pair<JavaRDD<String>, JavaRDD<String>> splitNewDataToTrainTest(JavaRDD<String> newData) {
    // Rough approximation; assumes timestamps are fairly evenly distributed
    StatCounter maxMin = newData.mapToDouble(line -> MLFunctions.TO_TIMESTAMP_FN.call(line).doubleValue()).stats();

    long minTime = (long) maxMin.min();
    long maxTime = (long) maxMin.max();
    log.info("New data timestamp range: {} - {}", minTime, maxTime);
    long approxTestTrainBoundary = (long) (maxTime - getTestFraction() * (maxTime - minTime));
    log.info("Splitting at timestamp {}", approxTestTrainBoundary);

    JavaRDD<String> newTrainData = newData.filter(
            line -> MLFunctions.TO_TIMESTAMP_FN.call(line) < approxTestTrainBoundary);
    JavaRDD<String> testData = newData.filter(
            line -> MLFunctions.TO_TIMESTAMP_FN.call(line) >= approxTestTrainBoundary);

    return new Pair<>(newTrainData, testData);
}
@Override
public RDD<Row> buildScan() {
    log.debug("-> buildScan()");
    schema();

    // I have isolated the work to a method to keep the plumbing code as simple as
    // possible.
    List<PhotoMetadata> table = collectData();

    @SuppressWarnings("resource")
    JavaSparkContext sparkContext = new JavaSparkContext(sqlContext.sparkContext());
    JavaRDD<Row> rowRDD = sparkContext.parallelize(table)
            .map(photo -> SparkBeanUtils.getRowFromBean(schema, photo));
    return rowRDD.rdd();
}
private JavaPairRDD<String, String> parallizeData(SparkDriver spark, List<Tuple2<String, String>> datasetContent) {
    JavaRDD<Tuple2<String, String>> datasetContentRDD = spark.sc.parallelize(datasetContent);
    return datasetContentRDD.mapToPair(new PairFunction<Tuple2<String, String>, String, String>() {

        private static final long serialVersionUID = 1L;

        @Override
        public Tuple2<String, String> call(Tuple2<String, String> term) throws Exception {
            return term;
        }
    });
}
/**
 * getAllWordsInDoc: Extracts all unique terms from all docs.
 *
 * @param docwordRDD Pair RDD, each key is a doc, and value is term list extracted from
 *                   that doc.
 * @return unique term list
 */
public static JavaRDD<String> getAllWordsInDoc(JavaPairRDD<String, List<String>> docwordRDD) {
    JavaRDD<String> wordRDD = docwordRDD.values().flatMap(new FlatMapFunction<List<String>, String>() {

        private static final long serialVersionUID = 1L;

        @Override
        public Iterator<String> call(List<String> list) {
            return list.iterator();
        }
    }).distinct();

    return wordRDD;
}
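A small local example of the input shape this method expects; the doc ids and terms below are invented.

JavaPairRDD<String, List<String>> docwordRDD = jsc.parallelizePairs(Arrays.asList(
        new Tuple2<>("doc1", Arrays.asList("ocean", "wind", "temperature")),
        new Tuple2<>("doc2", Arrays.asList("wind", "salinity"))));
JavaRDD<String> vocabulary = getAllWordsInDoc(docwordRDD);
// vocabulary now holds each term once: ocean, wind, temperature, salinity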
private Pair<JavaRDD<M>, JavaRDD<M>> splitTrainTest(JavaRDD<M> newData, JavaRDD<M> pastData) {
    Objects.requireNonNull(newData);
    if (testFraction <= 0.0) {
        return new Pair<>(pastData == null ? newData : newData.union(pastData), null);
    }
    if (testFraction >= 1.0) {
        return new Pair<>(pastData, newData);
    }
    if (empty(newData)) {
        return new Pair<>(pastData, null);
    }
    Pair<JavaRDD<M>, JavaRDD<M>> newTrainTest = splitNewDataToTrainTest(newData);
    JavaRDD<M> newTrainData = newTrainTest.getFirst();
    return new Pair<>(pastData == null ? newTrainData : newTrainData.union(pastData),
            newTrainTest.getSecond());
}
/**
 * loadClickStremFromTxt: Load click stream from txt file
 *
 * @param clickthroughFile
 *            txt file
 * @param sc
 *            the spark context
 * @return clickstream list in JavaRDD format {@link ClickStream}
 */
public JavaRDD<ClickStream> loadClickStremFromTxt(String clickthroughFile, JavaSparkContext sc) {
    return sc.textFile(clickthroughFile).flatMap(new FlatMapFunction<String, ClickStream>() {

        private static final long serialVersionUID = 1L;

        @SuppressWarnings("unchecked")
        @Override
        public Iterator<ClickStream> call(String line) throws Exception {
            List<ClickStream> clickthroughs = (List<ClickStream>) ClickStream.parseFromTextLine(line);
            return clickthroughs.iterator();
        }
    });
}
public void combineShortSessionsInParallel(int timeThres) throws InterruptedException, IOException {

    JavaRDD<String> userRDD = getUserRDD(this.cleanupType);

    userRDD.foreachPartition(new VoidFunction<Iterator<String>>() {

        private static final long serialVersionUID = 1L;

        @Override
        public void call(Iterator<String> arg0) throws Exception {
            ESDriver tmpES = new ESDriver(props);
            tmpES.createBulkProcessor();
            while (arg0.hasNext()) {
                String s = arg0.next();
                combineShortSessions(tmpES, s, timeThres);
            }
            tmpES.destroyBulkProcessor();
            tmpES.close();
        }
    });
}
private Map<Integer, Collection<String>> getDistinctValues(JavaRDD<String[]> parsedRDD) {
    int[] categoricalIndices = IntStream.range(0, inputSchema.getNumFeatures())
            .filter(inputSchema::isCategorical).toArray();

    return parsedRDD.mapPartitions(data -> {
        Map<Integer, Collection<String>> categoryValues = new HashMap<>();
        for (int i : categoricalIndices) {
            categoryValues.put(i, new HashSet<>());
        }
        data.forEachRemaining(datum ->
                categoryValues.forEach((category, values) -> values.add(datum[category]))
        );
        return Collections.singleton(categoryValues).iterator();
    }).reduce((v1, v2) -> {
        // Assumes both have the same key set
        v1.forEach((category, values) -> values.addAll(v2.get(category)));
        return v1;
    });
}
protected void checkUserPartition(JavaRDD<String> userRDD) {
    List<Partition> partitions = userRDD.partitions();
    System.out.println("Number of partitions: " + partitions.size());

    int[] partitionIds = new int[partitions.size()];
    for (int i = 0; i < partitions.size(); i++) {
        partitionIds[i] = partitions.get(i).index();
    }

    List<String>[] userIPs = userRDD.collectPartitions(partitionIds);
    for (int i = 0; i < userIPs.length; i++) {
        List<String> iuser = userIPs[i];
        System.out.println(i + " partition");
        System.out.println(iuser.toString());
    }
}
/**
 * GetSVDMatrix: Create SVD matrix csv file from original csv file.
 *
 * @param csvFileName       each row is a term, and each column is a document.
 * @param svdDimention      Dimension of SVD matrix
 * @param svdMatrixFileName CSV file name of SVD matrix
 */
public void getSVDMatrix(String csvFileName, int svdDimention, String svdMatrixFileName) {

    JavaPairRDD<String, Vector> importRDD = MatrixUtil.loadVectorFromCSV(spark, csvFileName, 1);
    JavaRDD<Vector> vectorRDD = importRDD.values();

    RowMatrix wordDocMatrix = new RowMatrix(vectorRDD.rdd());
    RowMatrix tfidfMatrix = MatrixUtil.createTFIDFMatrix(wordDocMatrix);
    RowMatrix svdMatrix = MatrixUtil.buildSVDMatrix(tfidfMatrix, svdDimention);

    List<String> rowKeys = importRDD.keys().collect();
    List<String> colKeys = new ArrayList<>();
    for (int i = 0; i < svdDimention; i++) {
        colKeys.add("dimension" + i);
    }
    MatrixUtil.exportToCSV(svdMatrix, rowKeys, colKeys, svdMatrixFileName);
}
/**
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids for the second collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKvalueCandidates,
        JavaRDD<String> rawTriples1, JavaRDD<String> rawTriples2, String SEPARATOR,
        JavaRDD<String> entityIds1, JavaRDD<String> entityIds2,
        float MIN_SUPPORT_THRESHOLD, int K, int N, JavaSparkContext jsc) {

    Map<Integer, IntArrayList> inNeighbors = new HashMap<>(new RelationsRank()
            .run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank()
            .run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));

    Broadcast<Map<Integer, IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);

    JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =
            getTopKNeighborSimsSUM(topKvalueCandidates, inNeighbors_BV, K);
    return topKneighborCandidates;
}
@Test
public void testSmallAmountOfData() {
    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
            .updater(Updater.RMSPROP)
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).iterations(1)
            .list()
            .layer(0, new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder()
                    .nIn(nIn).nOut(3)
                    .activation("tanh").build())
            .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MSE)
                    .nIn(3).nOut(nOut)
                    .activation("softmax")
                    .build())
            .build();

    SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf,
            new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 10, 1, 0));

    Nd4j.getRandom().setSeed(12345);
    DataSet d1 = new DataSet(Nd4j.rand(1, nIn), Nd4j.rand(1, nOut));
    DataSet d2 = new DataSet(Nd4j.rand(1, nIn), Nd4j.rand(1, nOut));

    JavaRDD<DataSet> rddData = sc.parallelize(Arrays.asList(d1, d2));

    sparkNet.fit(rddData);
}
private static CoordinateMatrix GetLU_COORD(CoordinateMatrix A) {
    JavaRDD<MatrixEntry> rows = A.entries().toJavaRDD().cache();

    JavaRDD<MatrixEntry> LUEntries = rows.mapPartitions(new FlatMapFunction<Iterator<MatrixEntry>, MatrixEntry>() {
        @Override
        public Iterator<MatrixEntry> call(Iterator<MatrixEntry> matrixEntryIterator) throws Exception {
            List<MatrixEntry> newLowerEntries = new ArrayList<MatrixEntry>();

            while (matrixEntryIterator.hasNext()) {
                MatrixEntry currentEntry = matrixEntryIterator.next();
                if (currentEntry.i() != currentEntry.j()) {
                    newLowerEntries.add(currentEntry);
                } else {
                    newLowerEntries.add(new MatrixEntry(currentEntry.i(), currentEntry.j(), 0.0));
                }
            }
            return newLowerEntries.iterator();
        }
    });

    CoordinateMatrix newMatrix = new CoordinateMatrix(LUEntries.rdd());
    return newMatrix;
}
public static TrainedModel create(JavaRDD<Rating> trainingRdd, JavaRDD<Rating> testRdd,
                                  int rank, int iterationsNr) {
    logger.info(String.format("Train with parameters -> iterations: %d, rank: %d", iterationsNr, rank));
    JavaRDD<Tuple2<Object, Object>> testForPredict = testRdd.map(rating ->
            new Tuple2<>(rating.user(), rating.product()));

    TimeKeeper timeKeeper = new TimeKeeper();
    timeKeeper.start();
    MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(trainingRdd), rank, iterationsNr, 0.1);
    timeKeeper.end().print(logger, "als model trained in ").reset();

    Double error = getError(testRdd, rank, iterationsNr, testForPredict, timeKeeper, model);
    return new TrainedModel(error, model);
}
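A possible caller sketch, assuming ratings come from a "user,product,rating" CSV; the file name, the 80/20 split, and the rank/iteration values are placeholders.

JavaRDD<Rating> ratings = sc.textFile("ratings.csv").map(line -> {
    String[] parts = line.split(",");
    return new Rating(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]),
            Double.parseDouble(parts[2]));
});
JavaRDD<Rating>[] split = ratings.randomSplit(new double[]{0.8, 0.2}, 0L);
TrainedModel model = ModelFactory.create(split[0], split[1], 10, 10);
logger.info("test RMSE = " + model.getError());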
/**
 * Loads in historical data and stores in Hazelcast IMDG. This is mostly to
 * provide a source of horses for the bet simulation.
 *
 * @throws IOException
 */
public void loadHistoricalRaces() throws IOException {
    filePath = Utils.unpackDataToTmp("historical_races.json");

    final JavaRDD<String> eventsText = sc.textFile(filePath.toString());
    final JavaRDD<Event> events = eventsText.map(s -> JSONSerializable.parse(s, Event::parseBag));

    final JavaPairRDD<Horse, Integer> winners =
            events.mapToPair(e -> new Tuple2<>(e.getRaces().get(0).getWinner().orElse(Horse.PALE), 1))
                  .reduceByKey((a, b) -> a + b);

    final HazelcastRDDFunctions accessToHC = javaPairRddFunctions(winners);
    accessToHC.saveToHazelcastMap("winners");
}
private TrainedModel createAlsModel(JavaRDD<Rating> ratings, TrainConfig trainConfig) {
    double[] weights = {8, 2};
    JavaRDD<Rating>[] randomRatings = ratings.randomSplit(weights, 0L);

    return ModelFactory.create(randomRatings[0],
            randomRatings[1],
            trainConfig.getRankNr(),
            trainConfig.getIterationsNr());
}
/**
 * Get quads that specify type of a resource
 *
 * @param quads             RDD of quads to filter
 * @param typePredicateURIs additional type predicates to use together with rdf:type
 * @return RDD of quads that specify type of a resource
 */
public static JavaRDD<Quad> filterTypeQuads(JavaRDD<Quad> quads, List<String> typePredicateURIs) {
    String typePredicateURI = RDF.TYPE.toString();
    return quads.filter(quad -> {
        if (!quad.getPredicate().isURI() || !quad.getObject().isURI()) {
            return false;
        }
        String uri = quad.getPredicate().getURI();
        return uri.equals(typePredicateURI) || typePredicateURIs.contains(uri);
    });
}
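A small check with hand-built Jena quads; the URIs and the extra type predicate are fabricated for illustration, and rdf:type is written out explicitly here to avoid depending on a particular vocabulary class.

String rdfType = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
List<Quad> sample = Arrays.asList(
        new Quad(Quad.defaultGraphIRI,
                NodeFactory.createURI("http://example.com/a"),
                NodeFactory.createURI(rdfType),
                NodeFactory.createURI("http://example.com/SomeType")),
        new Quad(Quad.defaultGraphIRI,
                NodeFactory.createURI("http://example.com/a"),
                NodeFactory.createURI("http://example.com/name"),
                NodeFactory.createURI("http://example.com/b")));
JavaRDD<Quad> typeQuads = filterTypeQuads(jsc.parallelize(sample),
        Arrays.asList("http://example.com/hasType")); // only the rdf:type quad survives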
private static JavaRDD<String> dfToTabDelimited(Dataset<Row> df) {
    return df.toJavaRDD().map(row -> {
        //qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
        String output = row.getAs("qseqid") + "\t" + row.getAs("sseqid") + "\t" + row.getAs("pident") + "\t"
                + row.getAs("length") + "\t" + row.getAs("mismatch") + "\t" + row.getAs("gapopen")
                + "\t" + row.getAs("qstart") + "\t" + row.getAs("qend") + "\t" + row.getAs("sstart")
                + "\t" + row.getAs("send") + "\t" + row.getAs("evalue") + "\t" + row.getAs("bitscore");
        return output;
    });
}
private static JavaRDD<String> dfToCSV(Dataset<Row> df) {
    return df.toJavaRDD().map(row -> {
        //qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
        String output = row.getAs("qseqid") + "," + row.getAs("sseqid") + "," + row.getAs("pident") + ","
                + row.getAs("length") + "," + row.getAs("mismatch") + "," + row.getAs("gapopen")
                + "," + row.getAs("qstart") + "," + row.getAs("qend") + "," + row.getAs("sstart")
                + "," + row.getAs("send") + "," + row.getAs("evalue") + "," + row.getAs("bitscore");
        return output;
    });
}
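A possible way to persist the flattened rows, called as if from within the same class; the SparkSession, the Parquet source, and the output directories below are placeholders.

Dataset<Row> blastResults = spark.read().parquet("blast_results.parquet"); // hypothetical source
dfToCSV(blastResults).saveAsTextFile("blast_results_csv");
dfToTabDelimited(blastResults).saveAsTextFile("blast_results_tsv");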
/**
 * Returns the pairs that satisfy the following conditions:
 * 1. the entities of this pair have the same label
 * 2. only this pair of entities has this label
 * @param inputTriples1
 * @param inputTriples2
 * @param entityIds1
 * @param entityIds2
 * @param SEPARATOR
 * @return
 */
public JavaPairRDD<Integer, Integer> getMatchesFromLabels(JavaRDD<String> inputTriples1, JavaRDD<String> inputTriples2,
        JavaRDD<String> entityIds1, JavaRDD<String> entityIds2, String SEPARATOR,
        Set<String> labelAtts1, Set<String> labelAtts2) {
    JavaPairRDD<String, Integer> labelBlocks1 = getLabelBlocks(inputTriples1, labelAtts1, entityIds1, SEPARATOR, true);
    JavaPairRDD<String, Integer> labelBlocks2 = getLabelBlocks(inputTriples2, labelAtts2, entityIds2, SEPARATOR, false);

    return labelBlocks2.join(labelBlocks1) //get blocks from labels existing in both collections (inner join) (first D2, to keep negative ids first)
            .reduceByKey((x, y) -> x != null && x.equals(y) ? x : null) //if the block has more than two (one pair of) entities, skip this block
            .filter(x -> x._2() != null)
            .mapToPair(pair -> new Tuple2<>(pair._2()._1(), pair._2()._2()))
            .distinct()
            .reduceByKey((x, y) -> null) //if the entity is matched to more than one entity, skip it (this can happen when it has > 1 labels, one shared with one entity and another shared with a different entity)
            .filter(x -> x._2() != null);
}
/**
 * Returns an RDD of bundles loaded from the given path.
 *
 * @param spark         the spark session
 * @param path          a path to a directory of FHIR Bundles
 * @param minPartitions a suggested value for the minimal number of partitions
 * @return an RDD of FHIR Bundles
 */
public static JavaRDD<Bundle> loadFromDirectory(SparkSession spark, String path, int minPartitions) {
    return spark.sparkContext()
            .wholeTextFiles(path, minPartitions)
            .toJavaRDD()
            .map(new ToBundle());
}
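A minimal caller sketch; the directory is a placeholder and the static method is assumed to be imported or qualified with its enclosing class.

SparkSession spark = SparkSession.builder().appName("bundle-loader").getOrCreate();
JavaRDD<Bundle> bundles = loadFromDirectory(spark, "data/fhir-bundles", 200);
System.out.println("Loaded bundles: " + bundles.count());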
public static JavaPairRDD<SAMRecord, SAMRecordWritable> readsToWritableNoHeader(JavaRDD<SAMRecord> records) {
    return records.mapToPair(read -> {
        final SAMRecordWritable samRecordWritable = new SAMRecordWritable();
        samRecordWritable.set(read);
        return new Tuple2<>(read, samRecordWritable);
    });
}