public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach(splits -> {
        Path path = splits._1.getPath();
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
        FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);

        writeInterleavedSplits(fqreader, fqreader2, new Configuration(),
                splitDir + "/" + path.getParent().getName() + "_" + splits._1.getStart() + ".fq");
    });
}
public static void main(String[] args) throws InterruptedException {
    SparkConf sc = new SparkConf().setAppName("POC-Streaming");
    try (JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(2000))) {
        //JavaDStream<SampleXML> records = jsc.textFileStream("input/").map(new ParseXML());
        //textFileStream processes files line by line, so each XML document would have to fit on one line to work; alternative below
        JavaRDD<String> files = jsc.sparkContext().wholeTextFiles("input/").map(tuple -> tuple._2());
        Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
        rddQueue.add(files);

        JavaDStream<String> records = jsc.queueStream(rddQueue);
        records.foreachRDD(rdd -> System.out.printf("Amount of XMLs: %d\n", rdd.count()));

        jsc.start();
        jsc.awaitTermination();
    }
}
/**
 * @param sparkContext    active Spark Context
 * @param trainData       training data on which to build a model
 * @param hyperParameters ordered list of hyper parameter values to use in building model
 * @param candidatePath   directory where additional model files can be written
 * @return a {@link PMML} representation of a model trained on the given data
 */
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
    int numClusters = (Integer) hyperParameters.get(0);
    Preconditions.checkArgument(numClusters > 1);
    log.info("Building KMeans Model with {} clusters", numClusters);

    JavaRDD<Vector> trainingData = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));
    KMeansModel kMeansModel = KMeans.train(trainingData.rdd(), numClusters, maxIterations,
            numberOfRuns, initializationStrategy);

    return kMeansModelToPMML(kMeansModel, fetchClusterCountsFromModel(trainingData, kMeansModel));
}
public TrainedModel findBestModel(JavaRDD<Rating> ratings) {
    double[] weights = {6, 2, 2};
    JavaRDD<Rating>[] randomRatings = ratings.randomSplit(weights, 0L);
    JavaRDD<Rating> trainingRdd = randomRatings[0];
    JavaRDD<Rating> validationRdd = randomRatings[1];
    JavaRDD<Rating> testRdd = randomRatings[2];

    TrainConfig trainConfig = findBestTrainingParameters(trainingRdd, validationRdd);
    TrainedModel model = ModelFactory.create(trainingRdd, testRdd,
            trainConfig.getRankNr(), trainConfig.getIterationsNr());
    logger.info("best model has RMSE = " + model.getError());
    return model;
}
public JavaPairRDD<Integer, IntArrayList> parseBlockCollection(JavaRDD<String> blockingInput) {
    System.out.println("Parsing the blocking collection...");
    return blockingInput
        .map(line -> line.split("\t"))      //split to [blockId, [entityIds]]
        .filter(line -> line.length == 2)   //only keep lines of this format
        .mapToPair(pair -> {
            int blockId = Integer.parseInt(pair[0]);
            String[] entities = pair[1].replaceFirst(";", "").split("#");
            if (entities == null || entities.length == 0) {
                return null;
            }
            //possible (but not really probable) cause of OOM (memory errors) if huge blocks exist
            List<Integer> outputEntities = new ArrayList<>();
            for (String entity : entities) {
                if (entity.isEmpty()) {
                    continue; //in case the last entityId finishes with '#'
                }
                Integer entityId = Integer.parseInt(entity);
                outputEntities.add(entityId);
            }
            return new Tuple2<>(blockId, new IntArrayList(outputEntities.stream().mapToInt(i -> i).toArray()));
        })
        .filter(x -> x != null);
}
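A minimal usage sketch for parseBlockCollection, called as if from within its enclosing class; the local master, the hard-coded input lines, and the variable names are illustrative only. Each line follows the blockId<TAB>entityId#entityId#... layout the method expects.

JavaSparkContext jsc = new JavaSparkContext(new SparkConf().setAppName("block-parsing").setMaster("local[*]"));
JavaRDD<String> blockingInput = jsc.parallelize(Arrays.asList(
        "1\t3#7#12#",
        "2\t7#25#"));
JavaPairRDD<Integer, IntArrayList> blocks = parseBlockCollection(blockingInput);
blocks.collect().forEach(b -> System.out.println(b._1() + " -> " + b._2())); // e.g. 1 -> [3, 7, 12]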
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    Path fqpath = new Path(fqPath);
    String fqname = fqpath.getName();
    String[] ns = fqname.split("\\.");

    //TODO: Handle also compressed files
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    splitRDD.foreach(split -> {
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
        writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
    });
}
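A driver sketch for the splitter above, assuming the FASTQ file is reachable through the Hadoop FileSystem API and that this code runs inside the same class; the paths, the app name, and the 400000-line split size are placeholders.

Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path fq = new Path("reads_1.fastq");                        // hypothetical input
FileStatus fst = fs.getFileStatus(fq);

JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("fastq-splitter"));
splitFastq(fst, fq.toString(), "fastq_splits", 400000, sc); // 400000 lines = 100000 FASTQ records per split
sc.close();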
private static JavaRDD<String> getUniqueKmers(JavaPairRDD<Text, SequencedFragment> fastqRDD, int k) {
    JavaRDD<String> rdd = fastqRDD.mapPartitions(records -> {
        HashSet<String> umer_set = new HashSet<String>();
        while (records.hasNext()) {
            Tuple2<Text, SequencedFragment> fastq = records.next();
            String seq = fastq._2.getSequence().toString();
            //HashSet<String> umer_in_seq = new HashSet<String>();
            //collect every k-length substring of the read (i <= length - k covers the last k-mer)
            for (int i = 0; i <= seq.length() - k; i++) {
                String kmer = seq.substring(i, i + k);
                umer_set.add(kmer);
            }
        }
        return umer_set.iterator();
    });

    JavaRDD<String> umersRDD = rdd.distinct();
    //umersRDD.sortBy(s -> s, true, 4);
    return umersRDD;
}
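One way this could be driven, assuming the reads are loaded with Hadoop-BAM's FastqInputFormat and the call happens inside the same class; the input format choice, the file name, and k = 21 are assumptions, not part of the snippet above.

JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(
        "reads.fastq", FastqInputFormat.class, Text.class, SequencedFragment.class,
        sc.hadoopConfiguration());
JavaRDD<String> kmers = getUniqueKmers(fastqRDD, 21);
System.out.println("distinct 21-mers: " + kmers.count());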
private static JavaPairRDD<Text, SequencedFragment> mapSAMRecordsToFastq(JavaRDD<SAMRecord> bamRDD) {
    JavaPairRDD<Text, SequencedFragment> fastqRDD = bamRDD.mapToPair(read -> {
        String name = read.getReadName();
        if (read.getReadPairedFlag()) {
            if (read.getFirstOfPairFlag())
                name = name + "/1";
            if (read.getSecondOfPairFlag())
                name = name + "/2";
        }

        //TODO: check values
        Text t = new Text(name);
        SequencedFragment sf = new SequencedFragment();
        sf.setSequence(new Text(read.getReadString()));
        sf.setQuality(new Text(read.getBaseQualityString()));

        return new Tuple2<Text, SequencedFragment>(t, sf);
    });
    return fastqRDD;
}
@Test
public void testFilterWithoutIgnoreTypes() {
    InstanceFilterConfig config = new InstanceFilterConfig()
            .setIgnoreOtherTypes(false)
            .setTypes(Arrays.asList("http://example.com/a/type", "http://example.com/b/type"));
    InstanceFilter filter = new InstanceFilter(config);
    JavaRDD<Instance> result = filter.filter(testRDD, typeIndex);

    List<Instance> rdd = new ArrayList<>();

    Instance one = new Instance();
    one.addType(typeIndex.getIndex("http://example.com/a/type"));
    one.setUri("http://example.com/one");
    rdd.add(one);

    Instance two = new Instance();
    two.addType(typeIndex.getIndex("http://example.com/b/type"));
    two.addType(typeIndex.getIndex("http://example.com/c/type"));
    two.setUri("http://example.com/two");
    rdd.add(two);

    JavaRDD<Instance> expected = jsc().parallelize(rdd);

    assertRDDEquals("Expected instances without restricted types were filtered out", expected, result);
}
/**
 * Maps an entity url to its entity id, that is also used by blocking.
 * @deprecated use {@link #readEntityIdsMapping(JavaRDD)} instead, to get the entity mappings used in blocking
 * @param rawTriples
 * @param SEPARATOR
 * @return a map from an entity url to its entity id, that is also used by blocking.
 */
public static Object2IntOpenHashMap<String> getEntityIdsMapping(JavaRDD<String> rawTriples, String SEPARATOR) {
    LinkedHashSet<String> subjectsSet = new LinkedHashSet<>(rawTriples
            .map(line -> line.split(SEPARATOR)[0])
            .collect()); //convert list to set (to remove duplicates)

    Object2IntOpenHashMap<String> result = new Object2IntOpenHashMap<>(subjectsSet.size());
    result.defaultReturnValue(-1);
    int index = 0;
    for (String subject : subjectsSet) {
        result.put(subject, index++);
    }
    return result;
}
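A usage sketch under the assumption that the triples file uses a single space as SEPARATOR; the file name and the lookup key are made up.

JavaRDD<String> rawTriples = jsc.textFile("triples.nt");                   // hypothetical input
Object2IntOpenHashMap<String> entityIds = getEntityIdsMapping(rawTriples, " ");
int id = entityIds.getInt("<http://example.com/entity1>");                 // -1 when the subject is unknown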
/**
 * Returns the latest versions of a given set of concept maps.
 *
 * @param urls a set of URLs to retrieve the latest version for, or null to load them all.
 * @param includeExperimental flag to include concept maps marked as experimental
 *
 * @return a map of concept map URLs to the latest version for them.
 */
public Map<String, String> getLatestVersions(final Set<String> urls, boolean includeExperimental) {

    // Reduce by the concept map URI to return only the latest version
    // per concept map. Spark's provided max aggregation function
    // only works on numeric types, so we jump into RDDs and perform
    // the reduce by hand.
    JavaRDD<UrlAndVersion> changes = this.conceptMaps.select(col("url"), col("version"), col("experimental"))
            .toJavaRDD()
            .filter(row -> (urls == null || urls.contains(row.getString(0)))
                    && (includeExperimental || row.isNullAt(2) || !row.getBoolean(2)))
            .mapToPair(row -> new Tuple2<>(row.getString(0), row.getString(1)))
            .reduceByKey((leftVersion, rightVersion) ->
                    leftVersion.compareTo(rightVersion) > 0 ? leftVersion : rightVersion)
            .map(tuple -> new UrlAndVersion(tuple._1, tuple._2));

    return this.spark.createDataset(changes.rdd(), URL_AND_VERSION_ENCODER)
            .collectAsList()
            .stream()
            .collect(Collectors.toMap(UrlAndVersion::getUrl, UrlAndVersion::getVersion));
}
/**
 * @param evalData data for evaluation
 * @return the Dunn Index of a given clustering
 *  (https://en.wikipedia.org/wiki/Cluster_analysis#Internal_evaluation); higher is better
 */
@Override
double evaluate(JavaRDD<Vector> evalData) {
    // Intra-cluster distance is mean distance to centroid
    double maxIntraClusterDistance =
            fetchClusterMetrics(evalData).values().mapToDouble(ClusterMetric::getMeanDist).max();

    // Inter-cluster distance is distance between centroids
    double minInterClusterDistance = Double.POSITIVE_INFINITY;
    List<ClusterInfo> clusters = new ArrayList<>(getClustersByID().values());
    DistanceFn<double[]> distanceFn = getDistanceFn();
    for (int i = 0; i < clusters.size(); i++) {
        double[] centerI = clusters.get(i).getCenter();
        // Distances are symmetric, hence d(i,j) == d(j,i)
        for (int j = i + 1; j < clusters.size(); j++) {
            double[] centerJ = clusters.get(j).getCenter();
            minInterClusterDistance =
                    Math.min(minInterClusterDistance, distanceFn.applyAsDouble(centerI, centerJ));
        }
    }

    return minInterClusterDistance / maxIntraClusterDistance;
}
private static JavaRDD<SAMRecord> alignmentsToSAM(JavaRDD<String> alignmentRDD, SAMFileHeader header) {
    return alignmentRDD.mapPartitions(alns -> {
        List<SAMRecord> records = new ArrayList<SAMRecord>();
        final SAMLineParser samLP = new SAMLineParser(new DefaultSAMRecordFactory(),
                ValidationStringency.SILENT, header, null, null);
        while (alns.hasNext()) {
            String aln = alns.next().replace("\r\n", "").replace("\n", "").replace(System.lineSeparator(), "");
            try {
                SAMRecord record = samLP.parseLine(aln);
                records.add(record);
            } catch (SAMFormatException e) {
                System.out.println(e.getMessage());
            }
        }
        return records.iterator();
    });
}
public static JavaPairRDD<SAMRecord, SAMRecordWritable> readsToWritable(JavaRDD<SAMRecord> records, Broadcast<SAMFileHeader> header) {
    return records.mapToPair(read -> {
        //SEQUENCE DICTIONARY must be set here for the alignment because it's not given as header file
        //Set in alignment to sam map phase
        if (header.getValue().getSequenceDictionary() == null)
            header.getValue().setSequenceDictionary(new SAMSequenceDictionary());
        if (header.getValue().getSequenceDictionary().getSequence(read.getReferenceName()) == null)
            header.getValue().getSequenceDictionary().addSequence(new SAMSequenceRecord(read.getReferenceName()));

        //read.setHeader(read.getHeader());
        read.setHeaderStrict(header.getValue());
        final SAMRecordWritable samRecordWritable = new SAMRecordWritable();
        samRecordWritable.set(read);
        return new Tuple2<>(read, samRecordWritable);
    });
}
/**
 * Return an RDD with keys: label objects, and values: entity ids from a single collection, having this label
 * @param inputTriples
 * @param labelAtts
 * @param entityIds
 * @param SEPARATOR
 * @param positiveIds
 * @return
 */
private JavaPairRDD<String, Integer> getLabelBlocks(JavaRDD<String> inputTriples, Set<String> labelAtts,
        JavaRDD<String> entityIds, String SEPARATOR, boolean positiveIds) {
    Object2IntOpenHashMap<String> urls1 = Utils.readEntityIdsMapping(entityIds, positiveIds);
    return inputTriples.mapToPair(line -> {
        String[] spo = line.toLowerCase().replaceAll(" \\.$", "").split(SEPARATOR); //lose the ending " ." from valid .nt files
        if (spo.length < 3) {
            return null;
        }
        if (labelAtts.contains(spo[1])) {
            String labelValue = line.substring(line.indexOf(spo[1]) + spo[1].length() + SEPARATOR.length())
                    .toLowerCase().replaceAll("[^a-z0-9 ]", "").trim();
            int subjectId = urls1.getInt(Utils.encodeURIinUTF8(spo[0])); //replace subject url with entity id
            if (!positiveIds) {
                subjectId = -subjectId;
            }
            return new Tuple2<String, Integer>(labelValue, subjectId);
        } else {
            return null;
        }
    })
    .filter(x -> x != null)
    .distinct();
}
public static void main(String[] args) throws Exception {
    System.out.println(System.getProperty("hadoop.home.dir"));
    String inputPath = args[0];
    String outputPath = args[1];
    FileUtils.deleteQuietly(new File(outputPath));

    JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");

    JavaRDD<String> rdd = sc.textFile(inputPath);
    JavaPairRDD<String, Integer> counts = rdd
            .flatMap(x -> Arrays.asList(x.split(" ")).iterator())
            .mapToPair(x -> new Tuple2<String, Integer>(x, 1))
            .reduceByKey((x, y) -> x + y);
    counts.saveAsTextFile(outputPath);
    sc.close();
}
/**
 * Implementation which splits based solely on time. It will return approximately
 * the earliest {@link #getTestFraction()} of input, ordered by timestamp, as new training
 * data and the rest as test data.
 */
@Override
protected Pair<JavaRDD<String>, JavaRDD<String>> splitNewDataToTrainTest(JavaRDD<String> newData) {
    // Rough approximation; assumes timestamps are fairly evenly distributed
    StatCounter maxMin = newData.mapToDouble(line -> MLFunctions.TO_TIMESTAMP_FN.call(line).doubleValue()).stats();

    long minTime = (long) maxMin.min();
    long maxTime = (long) maxMin.max();
    log.info("New data timestamp range: {} - {}", minTime, maxTime);
    long approxTestTrainBoundary = (long) (maxTime - getTestFraction() * (maxTime - minTime));
    log.info("Splitting at timestamp {}", approxTestTrainBoundary);

    JavaRDD<String> newTrainData = newData.filter(
            line -> MLFunctions.TO_TIMESTAMP_FN.call(line) < approxTestTrainBoundary);
    JavaRDD<String> testData = newData.filter(
            line -> MLFunctions.TO_TIMESTAMP_FN.call(line) >= approxTestTrainBoundary);

    return new Pair<>(newTrainData, testData);
}
@Override
public RDD<Row> buildScan() {
    log.debug("-> buildScan()");
    schema();

    // I have isolated the work to a method to keep the plumbing code as simple as
    // possible.
    List<PhotoMetadata> table = collectData();

    @SuppressWarnings("resource")
    JavaSparkContext sparkContext = new JavaSparkContext(sqlContext.sparkContext());
    JavaRDD<Row> rowRDD = sparkContext.parallelize(table)
            .map(photo -> SparkBeanUtils.getRowFromBean(schema, photo));
    return rowRDD.rdd();
}
private JavaPairRDD<String, String> parallizeData(SparkDriver spark, List<Tuple2<String, String>> datasetContent) {
    JavaRDD<Tuple2<String, String>> datasetContentRDD = spark.sc.parallelize(datasetContent);
    return datasetContentRDD.mapToPair(new PairFunction<Tuple2<String, String>, String, String>() {

        private static final long serialVersionUID = 1L;

        @Override
        public Tuple2<String, String> call(Tuple2<String, String> term) throws Exception {
            return term;
        }
    });
}
/**
 * getAllWordsInDoc: Extracts all unique terms from all docs.
 *
 * @param docwordRDD Pair RDD, each key is a doc, and value is term list extracted from
 *                   that doc.
 * @return unique term list
 */
public static JavaRDD<String> getAllWordsInDoc(JavaPairRDD<String, List<String>> docwordRDD) {
    JavaRDD<String> wordRDD = docwordRDD.values().flatMap(new FlatMapFunction<List<String>, String>() {

        private static final long serialVersionUID = 1L;

        @Override
        public Iterator<String> call(List<String> list) {
            return list.iterator();
        }
    }).distinct();

    return wordRDD;
}
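A small local example of the input shape this method expects; the doc ids and terms below are invented.

JavaPairRDD<String, List<String>> docwordRDD = jsc.parallelizePairs(Arrays.asList(
        new Tuple2<>("doc1", Arrays.asList("ocean", "wind", "temperature")),
        new Tuple2<>("doc2", Arrays.asList("wind", "salinity"))));
JavaRDD<String> vocabulary = getAllWordsInDoc(docwordRDD);
// vocabulary now holds each term once: ocean, wind, temperature, salinity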
private Pair<JavaRDD<M>, JavaRDD<M>> splitTrainTest(JavaRDD<M> newData, JavaRDD<M> pastData) {
    Objects.requireNonNull(newData);
    if (testFraction <= 0.0) {
        return new Pair<>(pastData == null ? newData : newData.union(pastData), null);
    }
    if (testFraction >= 1.0) {
        return new Pair<>(pastData, newData);
    }
    if (empty(newData)) {
        return new Pair<>(pastData, null);
    }
    Pair<JavaRDD<M>, JavaRDD<M>> newTrainTest = splitNewDataToTrainTest(newData);
    JavaRDD<M> newTrainData = newTrainTest.getFirst();
    return new Pair<>(pastData == null ? newTrainData : newTrainData.union(pastData),
            newTrainTest.getSecond());
}
/**
 * loadClickStremFromTxt: Load click stream from txt file
 *
 * @param clickthroughFile
 *            txt file
 * @param sc
 *            the spark context
 * @return clickstream list in JavaRDD format {@link ClickStream}
 */
public JavaRDD<ClickStream> loadClickStremFromTxt(String clickthroughFile, JavaSparkContext sc) {
    return sc.textFile(clickthroughFile).flatMap(new FlatMapFunction<String, ClickStream>() {

        private static final long serialVersionUID = 1L;

        @SuppressWarnings("unchecked")
        @Override
        public Iterator<ClickStream> call(String line) throws Exception {
            List<ClickStream> clickthroughs = (List<ClickStream>) ClickStream.parseFromTextLine(line);
            return clickthroughs.iterator();
        }
    });
}
public void combineShortSessionsInParallel(int timeThres) throws InterruptedException, IOException {

    JavaRDD<String> userRDD = getUserRDD(this.cleanupType);

    userRDD.foreachPartition(new VoidFunction<Iterator<String>>() {

        private static final long serialVersionUID = 1L;

        @Override
        public void call(Iterator<String> arg0) throws Exception {
            ESDriver tmpES = new ESDriver(props);
            tmpES.createBulkProcessor();
            while (arg0.hasNext()) {
                String s = arg0.next();
                combineShortSessions(tmpES, s, timeThres);
            }
            tmpES.destroyBulkProcessor();
            tmpES.close();
        }
    });
}
private Map<Integer, Collection<String>> getDistinctValues(JavaRDD<String[]> parsedRDD) {
    int[] categoricalIndices = IntStream.range(0, inputSchema.getNumFeatures())
            .filter(inputSchema::isCategorical).toArray();

    return parsedRDD.mapPartitions(data -> {
        Map<Integer, Collection<String>> categoryValues = new HashMap<>();
        for (int i : categoricalIndices) {
            categoryValues.put(i, new HashSet<>());
        }
        data.forEachRemaining(datum ->
                categoryValues.forEach((category, values) -> values.add(datum[category]))
        );
        return Collections.singleton(categoryValues).iterator();
    }).reduce((v1, v2) -> {
        // Assumes both have the same key set
        v1.forEach((category, values) -> values.addAll(v2.get(category)));
        return v1;
    });
}
protected void checkUserPartition(JavaRDD<String> userRDD) {
    List<Partition> partitions = userRDD.partitions();
    System.out.println("Number of partitions: " + partitions.size());

    int[] partitionIds = new int[partitions.size()];
    for (int i = 0; i < partitions.size(); i++) {
        partitionIds[i] = partitions.get(i).index();
    }

    List<String>[] userIPs = userRDD.collectPartitions(partitionIds);
    for (int i = 0; i < userIPs.length; i++) {
        List<String> iuser = userIPs[i];
        System.out.println(i + " partition");
        System.out.println(iuser.toString());
    }
}
/**
 * GetSVDMatrix: Create SVD matrix csv file from original csv file.
 *
 * @param csvFileName       each row is a term, and each column is a document.
 * @param svdDimention      Dimension of SVD matrix
 * @param svdMatrixFileName CSV file name of SVD matrix
 */
public void getSVDMatrix(String csvFileName, int svdDimention, String svdMatrixFileName) {

    JavaPairRDD<String, Vector> importRDD = MatrixUtil.loadVectorFromCSV(spark, csvFileName, 1);
    JavaRDD<Vector> vectorRDD = importRDD.values();

    RowMatrix wordDocMatrix = new RowMatrix(vectorRDD.rdd());
    RowMatrix tfidfMatrix = MatrixUtil.createTFIDFMatrix(wordDocMatrix);
    RowMatrix svdMatrix = MatrixUtil.buildSVDMatrix(tfidfMatrix, svdDimention);

    List<String> rowKeys = importRDD.keys().collect();
    List<String> colKeys = new ArrayList<>();
    for (int i = 0; i < svdDimention; i++) {
        colKeys.add("dimension" + i);
    }
    MatrixUtil.exportToCSV(svdMatrix, rowKeys, colKeys, svdMatrixFileName);
}
/**
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids for the second collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKvalueCandidates,
        JavaRDD<String> rawTriples1, JavaRDD<String> rawTriples2, String SEPARATOR,
        JavaRDD<String> entityIds1, JavaRDD<String> entityIds2,
        float MIN_SUPPORT_THRESHOLD, int K, int N, JavaSparkContext jsc) {

    Map<Integer, IntArrayList> inNeighbors = new HashMap<>(new RelationsRank()
            .run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank()
            .run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));

    Broadcast<Map<Integer, IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);

    JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =
            getTopKNeighborSimsSUM(topKvalueCandidates, inNeighbors_BV, K);
    return topKneighborCandidates;
}
@Test
public void testSmallAmountOfData() {
    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
            .updater(Updater.RMSPROP)
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).iterations(1)
            .list()
            .layer(0, new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder()
                    .nIn(nIn).nOut(3)
                    .activation("tanh").build())
            .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MSE)
                    .nIn(3).nOut(nOut)
                    .activation("softmax")
                    .build())
            .build();

    SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf,
            new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 10, 1, 0));

    Nd4j.getRandom().setSeed(12345);
    DataSet d1 = new DataSet(Nd4j.rand(1, nIn), Nd4j.rand(1, nOut));
    DataSet d2 = new DataSet(Nd4j.rand(1, nIn), Nd4j.rand(1, nOut));

    JavaRDD<DataSet> rddData = sc.parallelize(Arrays.asList(d1, d2));

    sparkNet.fit(rddData);
}
private static CoordinateMatrix GetLU_COORD(CoordinateMatrix A) {
    JavaRDD<MatrixEntry> rows = A.entries().toJavaRDD().cache();

    JavaRDD<MatrixEntry> LUEntries = rows.mapPartitions(new FlatMapFunction<Iterator<MatrixEntry>, MatrixEntry>() {
        @Override
        public Iterator<MatrixEntry> call(Iterator<MatrixEntry> matrixEntryIterator) throws Exception {
            List<MatrixEntry> newLowerEntries = new ArrayList<MatrixEntry>();

            while (matrixEntryIterator.hasNext()) {
                MatrixEntry currentEntry = matrixEntryIterator.next();
                if (currentEntry.i() != currentEntry.j()) {
                    newLowerEntries.add(currentEntry);
                } else {
                    newLowerEntries.add(new MatrixEntry(currentEntry.i(), currentEntry.j(), 0.0));
                }
            }
            return newLowerEntries.iterator();
        }
    });

    CoordinateMatrix newMatrix = new CoordinateMatrix(LUEntries.rdd());
    return newMatrix;
}
public static TrainedModel create(JavaRDD<Rating> trainingRdd, JavaRDD<Rating> testRdd,
                                  int rank, int iterationsNr) {
    logger.info(String.format("Train with parameters -> iterations: %d, rank: %d", iterationsNr, rank));
    JavaRDD<Tuple2<Object, Object>> testForPredict = testRdd.map(rating ->
            new Tuple2<>(rating.user(), rating.product()));

    TimeKeeper timeKeeper = new TimeKeeper();
    timeKeeper.start();
    MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(trainingRdd), rank, iterationsNr, 0.1);
    timeKeeper.end().print(logger, "als model trained in ").reset();

    Double error = getError(testRdd, rank, iterationsNr, testForPredict, timeKeeper, model);
    return new TrainedModel(error, model);
}
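A possible caller sketch, assuming ratings come from a "user,product,rating" CSV; the file name, the 80/20 split, and the rank/iteration values are placeholders.

JavaRDD<Rating> ratings = sc.textFile("ratings.csv").map(line -> {
    String[] parts = line.split(",");
    return new Rating(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]),
            Double.parseDouble(parts[2]));
});
JavaRDD<Rating>[] split = ratings.randomSplit(new double[]{0.8, 0.2}, 0L);
TrainedModel model = ModelFactory.create(split[0], split[1], 10, 10);
logger.info("test RMSE = " + model.getError());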
/**
 * Loads in historical data and stores in Hazelcast IMDG. This is mostly to
 * provide a source of horses for the bet simulation.
 *
 * @throws IOException
 */
public void loadHistoricalRaces() throws IOException {
    filePath = Utils.unpackDataToTmp("historical_races.json");

    final JavaRDD<String> eventsText = sc.textFile(filePath.toString());
    final JavaRDD<Event> events = eventsText.map(s -> JSONSerializable.parse(s, Event::parseBag));

    final JavaPairRDD<Horse, Integer> winners =
            events.mapToPair(e -> new Tuple2<>(e.getRaces().get(0).getWinner().orElse(Horse.PALE), 1))
                  .reduceByKey((a, b) -> a + b);

    final HazelcastRDDFunctions accessToHC = javaPairRddFunctions(winners);
    accessToHC.saveToHazelcastMap("winners");
}
private TrainedModel createAlsModel(JavaRDD<Rating> ratings, TrainConfig trainConfig) {
    double[] weights = {8, 2};
    JavaRDD<Rating>[] randomRatings = ratings.randomSplit(weights, 0L);

    return ModelFactory.create(randomRatings[0],
            randomRatings[1],
            trainConfig.getRankNr(),
            trainConfig.getIterationsNr());
}
/**
 * Get quads that specify type of a resource
 *
 * @param quads             RDD of quads to filter
 * @param typePredicateURIs additional type predicates to use together with rdf:type
 * @return RDD of quads that specify type of a resource
 */
public static JavaRDD<Quad> filterTypeQuads(JavaRDD<Quad> quads, List<String> typePredicateURIs) {
    String typePredicateURI = RDF.TYPE.toString();
    return quads.filter(quad -> {
        if (!quad.getPredicate().isURI() || !quad.getObject().isURI()) {
            return false;
        }
        String uri = quad.getPredicate().getURI();
        return uri.equals(typePredicateURI) || typePredicateURIs.contains(uri);
    });
}
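A small check with hand-built Jena quads; the URIs and the extra type predicate are fabricated for illustration, and rdf:type is written out explicitly here to avoid depending on a particular vocabulary class.

String rdfType = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
List<Quad> sample = Arrays.asList(
        new Quad(Quad.defaultGraphIRI,
                NodeFactory.createURI("http://example.com/a"),
                NodeFactory.createURI(rdfType),
                NodeFactory.createURI("http://example.com/SomeType")),
        new Quad(Quad.defaultGraphIRI,
                NodeFactory.createURI("http://example.com/a"),
                NodeFactory.createURI("http://example.com/name"),
                NodeFactory.createURI("http://example.com/b")));
JavaRDD<Quad> typeQuads = filterTypeQuads(jsc.parallelize(sample),
        Arrays.asList("http://example.com/hasType")); // only the rdf:type quad survives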
private static JavaRDD<String> dfToTabDelimited(Dataset<Row> df) {
    return df.toJavaRDD().map(row -> {
        //qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
        String output = row.getAs("qseqid") + "\t" + row.getAs("sseqid") + "\t" + row.getAs("pident") + "\t"
                + row.getAs("length") + "\t" + row.getAs("mismatch") + "\t" + row.getAs("gapopen")
                + "\t" + row.getAs("qstart") + "\t" + row.getAs("qend") + "\t" + row.getAs("sstart")
                + "\t" + row.getAs("send") + "\t" + row.getAs("evalue") + "\t" + row.getAs("bitscore");
        return output;
    });
}
private static JavaRDD<String> dfToCSV(Dataset<Row> df) {
    return df.toJavaRDD().map(row -> {
        //qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
        String output = row.getAs("qseqid") + "," + row.getAs("sseqid") + "," + row.getAs("pident") + ","
                + row.getAs("length") + "," + row.getAs("mismatch") + "," + row.getAs("gapopen")
                + "," + row.getAs("qstart") + "," + row.getAs("qend") + "," + row.getAs("sstart")
                + "," + row.getAs("send") + "," + row.getAs("evalue") + "," + row.getAs("bitscore");
        return output;
    });
}
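A possible way to persist the flattened rows, called as if from within the same class; the SparkSession, the Parquet source, and the output directories below are placeholders.

Dataset<Row> blastResults = spark.read().parquet("blast_results.parquet"); // hypothetical source
dfToCSV(blastResults).saveAsTextFile("blast_results_csv");
dfToTabDelimited(blastResults).saveAsTextFile("blast_results_tsv");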
/**
 * Returns the pairs that satisfy the following conditions:
 * 1. the entities of this pair have the same label
 * 2. only this pair of entities has this label
 * @param inputTriples1
 * @param inputTriples2
 * @param entityIds1
 * @param entityIds2
 * @param SEPARATOR
 * @return
 */
public JavaPairRDD<Integer, Integer> getMatchesFromLabels(JavaRDD<String> inputTriples1, JavaRDD<String> inputTriples2,
        JavaRDD<String> entityIds1, JavaRDD<String> entityIds2, String SEPARATOR,
        Set<String> labelAtts1, Set<String> labelAtts2) {
    JavaPairRDD<String, Integer> labelBlocks1 = getLabelBlocks(inputTriples1, labelAtts1, entityIds1, SEPARATOR, true);
    JavaPairRDD<String, Integer> labelBlocks2 = getLabelBlocks(inputTriples2, labelAtts2, entityIds2, SEPARATOR, false);

    return labelBlocks2.join(labelBlocks1) //get blocks from labels existing in both collections (inner join) (first D2, to keep negative ids first)
            .reduceByKey((x, y) -> x != null && x.equals(y) ? x : null) //if the block has more than two (one pair of) entities, skip this block
            .filter(x -> x._2() != null)
            .mapToPair(pair -> new Tuple2<>(pair._2()._1(), pair._2()._2()))
            .distinct()
            .reduceByKey((x, y) -> null) //if the entity is matched to more than one entity, skip it (this can happen when it has > 1 labels, one shared with one entity and another shared with a different entity)
            .filter(x -> x._2() != null);
}
/**
 * Returns an RDD of bundles loaded from the given path.
 *
 * @param spark         the spark session
 * @param path          a path to a directory of FHIR Bundles
 * @param minPartitions a suggested value for the minimal number of partitions
 * @return an RDD of FHIR Bundles
 */
public static JavaRDD<Bundle> loadFromDirectory(SparkSession spark, String path, int minPartitions) {
    return spark.sparkContext()
            .wholeTextFiles(path, minPartitions)
            .toJavaRDD()
            .map(new ToBundle());
}
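A minimal caller sketch; the directory is a placeholder and the static method is assumed to be imported or qualified with its enclosing class.

SparkSession spark = SparkSession.builder().appName("bundle-loader").getOrCreate();
JavaRDD<Bundle> bundles = loadFromDirectory(spark, "data/fhir-bundles", 200);
System.out.println("Loaded bundles: " + bundles.count());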
public static JavaPairRDD<SAMRecord, SAMRecordWritable> readsToWritableNoHeader(JavaRDD<SAMRecord> records) {
    return records.mapToPair(read -> {
        final SAMRecordWritable samRecordWritable = new SAMRecordWritable();
        samRecordWritable.set(read);
        return new Tuple2<>(read, samRecordWritable);
    });
}