Example source code for the Java class org.apache.spark.api.java.JavaSparkContext

Project: big-data-benchmark    File: SparkWordCount.java
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage:");
        System.err.println("  SparkWordCount <sourceFile> <targetFile>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf()
            .setAppName("Word Count");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> textFile = sc.textFile(args[0]);
    JavaRDD<String> words = textFile.flatMap(LineIterator::new);
    JavaPairRDD<String, Long> pairs =
            words.mapToPair(s -> new Tuple2<>(s, 1L));
    JavaPairRDD<String, Long> counts =
            pairs.reduceByKey((a, b) -> a + b);

    System.out.println("Starting task..");
    long t = System.currentTimeMillis();
    counts.saveAsTextFile(args[1] + "_" + t);
    System.out.println("Time=" + (System.currentTimeMillis() - t));
}
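Note: the LineIterator used in flatMap above is a project class not shown on this page. A minimal sketch of such a class, assuming it simply tokenizes one input line into whitespace-separated words (hypothetical, for illustration only):

import java.util.Iterator;
import java.util.NoSuchElementException;

// Hypothetical helper: wraps one text line and iterates over its whitespace-separated words.
// Because flatMap(LineIterator::new) constructs instances on the executors, the class itself
// does not need to implement Serializable.
public class LineIterator implements Iterator<String> {

    private final String[] tokens;
    private int position = 0;

    public LineIterator(String line) {
        String trimmed = line.trim();
        this.tokens = trimmed.isEmpty() ? new String[0] : trimmed.split("\\s+");
    }

    @Override
    public boolean hasNext() {
        return position < tokens.length;
    }

    @Override
    public String next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        return tokens[position++];
    }
}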
Project: Sempala    File: Spark.java
/**
 * Initializes a Spark connection. Use it afterwards to execute Spark SQL
 * queries.
 * 
 * @param appName
 *            the name of the app that will be used with this Spark
 *            connection
 * @param database
 *            name of the database that will be used with this Spark
 *            connection
 */
public Spark(String appName, String database) {

    // TODO check what happens if the same app name is already in use
    this.sparkConfiguration = new SparkConf().setAppName(appName);
    this.javaContext = new JavaSparkContext(sparkConfiguration);
    this.hiveContext = new HiveContext(javaContext);
    // TODO check what kind of exception can be thrown here if there is a
    // problem with the Spark connection

    this.hiveContext.sql(String.format("CREATE DATABASE %s", database));
    // TODO check what kind of exception is thrown if the database already exists

    // use the created database
    this.hiveContext.sql((String.format("USE %s", database)));
}
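A hedged sketch of how the TODO about an already existing database is commonly handled in Hive SQL, using an IF NOT EXISTS guard (not the project's actual code):

// Sketch only: create the database only when it does not exist yet.
this.hiveContext.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", database));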
Project: oryx2    File: KMeansUpdate.java
/**
 * @param sparkContext    active Spark Context
 * @param trainData       training data on which to build a model
 * @param hyperParameters ordered list of hyper parameter values to use in building model
 * @param candidatePath   directory where additional model files can be written
 * @return a {@link PMML} representation of a model trained on the given data
 */
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
  int numClusters = (Integer) hyperParameters.get(0);
  Preconditions.checkArgument(numClusters > 1);
  log.info("Building KMeans Model with {} clusters", numClusters);

  JavaRDD<Vector> trainingData = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));
  KMeansModel kMeansModel = KMeans.train(trainingData.rdd(), numClusters, maxIterations,
                                         numberOfRuns, initializationStrategy);

  return kMeansModelToPMML(kMeansModel, fetchClusterCountsFromModel(trainingData, kMeansModel));
}
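For context, a minimal self-contained sketch of the MLlib call used above, training KMeans on a tiny in-memory dataset. All values and names here are illustrative assumptions, not part of the oryx2 project, and the simpler three-argument train(data, k, maxIterations) overload is used:

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.clustering.KMeans;
import org.apache.spark.mllib.clustering.KMeansModel;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;

public class KMeansSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("KMeansSketch");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            // Four 2-D points forming two obvious clusters.
            JavaRDD<Vector> points = jsc.parallelize(Arrays.asList(
                    Vectors.dense(0.0, 0.0),
                    Vectors.dense(0.1, 0.1),
                    Vectors.dense(9.0, 9.0),
                    Vectors.dense(9.1, 9.1)));
            // Train on the underlying Scala RDD, as buildModel above does with trainingData.rdd().
            KMeansModel model = KMeans.train(points.rdd(), 2, 20);
            System.out.println("Cluster centers: " + Arrays.toString(model.clusterCenters()));
        }
    }
}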
Project: BLASpark    File: OtherOperations.java
public static DistributedMatrix GetLU(DistributedMatrix A, JavaSparkContext jsc) {

    DistributedMatrix returnedMatrix;

    if (A.getClass() == IndexedRowMatrix.class) {
        returnedMatrix = OtherOperations.GetLU_IRW((IndexedRowMatrix) A);
    }
    else if (A.getClass() == CoordinateMatrix.class) {
        returnedMatrix = OtherOperations.GetLU_COORD((CoordinateMatrix) A);
    }
    else if (A.getClass() == BlockMatrix.class) {
        // TODO: Implement this operation
        //returnedMatrices = OtherOperations.GetLU_BCK((BlockMatrix) A, diagonalInL, diagonalInU, jsc);
        returnedMatrix = null;
    }
    else {
        returnedMatrix = null;
    }

    return returnedMatrix;
}
Project: BLASpark    File: OtherOperations.java
public static DistributedMatrix GetD(DistributedMatrix A, boolean inverseValues, JavaSparkContext jsc) {

    DistributedMatrix returnedMatrix;

    if (A.getClass() == IndexedRowMatrix.class) {
        returnedMatrix = OtherOperations.GetD_IRW((IndexedRowMatrix) A, inverseValues, jsc);
    }
    else if (A.getClass() == CoordinateMatrix.class) {
        returnedMatrix = OtherOperations.GetD_COORD((CoordinateMatrix) A, inverseValues, jsc);
    }
    else if (A.getClass() == BlockMatrix.class) {
        // TODO: Implement this operation
        //returnedMatrices = OtherOperations.GetLU_BCK((BlockMatrix) A, diagonalInL, diagonalInU, jsc);
        returnedMatrix = null;
    }
    else {
        returnedMatrix = null;
    }

    return returnedMatrix;
}
Project: ViraPipe    File: InterleaveMulti.java
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
  Path fqpath = new Path(fqPath);
  String fqname = fqpath.getName();
  String[] ns = fqname.split("\\.");
  //TODO: Handle also compressed files
  List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

  JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

  splitRDD.foreach( split ->  {

    FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
    writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);

   });
}
Project: ViraPipe    File: Decompress.java
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {

    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach( splits ->  {
      Path path = splits._1.getPath();
      FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
      FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);

      writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir+"/"+path.getParent().getName()+"_"+splits._1.getStart()+".fq");
    });
  }
Project: Apache-Spark-2x-for-Java-Developers    File: WordCount.java
public static void wordCountJava8( String filename )
{
    // Define a configuration to use to interact with Spark
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Word Count App");

    // Create a Java version of the Spark Context from the configuration
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Load the input data, which is a text file read from the command line
    JavaRDD<String> input = sc.textFile( filename );

    // Java 8 with lambdas: split the input string into words
    // TODO: a change has happened here
    JavaRDD<String> words = input.flatMap( s -> Arrays.asList( s.split( " " ) ).iterator() );

    // Java 8 with lambdas: transform the collection of words into pairs (word and 1) and then count them
    JavaPairRDD<String, Integer> counts = words.mapToPair( t -> new Tuple2<>( t, 1 ) ).reduceByKey( (x, y) -> x + y );

    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile( "output" );
}
Project: Apache-Spark-2x-for-Java-Developers    File: SparkWordCount.java
public static void main(String[] args) throws Exception {
    System.out.println(System.getProperty("hadoop.home.dir"));
    String inputPath = args[0];
    String outputPath = args[1];
    FileUtils.deleteQuietly(new File(outputPath));

    JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");

    JavaRDD<String> rdd = sc.textFile(inputPath);

    JavaPairRDD<String, Integer> counts = rdd
            .flatMap(x -> Arrays.asList(x.split(" ")).iterator())
            .mapToPair(x -> new Tuple2<>(x, 1))
            .reduceByKey((x, y) -> x + y);

    counts.saveAsTextFile(outputPath);
    sc.close();
}
Project: incubator-sdap-mudrod    File: SessionExtractor.java
/**
 * loadClickStremFromTxt: load click stream from a txt file
 *
 * @param clickthroughFile
 *          txt file
 * @param sc
 *          the spark context
 * @return clickstream list in JavaRDD format {@link ClickStream}
 */
public JavaRDD<ClickStream> loadClickStremFromTxt(String clickthroughFile, JavaSparkContext sc) {
  return sc.textFile(clickthroughFile).flatMap(new FlatMapFunction<String, ClickStream>() {
    /**
     *
     */
    private static final long serialVersionUID = 1L;

    @SuppressWarnings("unchecked")
    @Override
    public Iterator<ClickStream> call(String line) throws Exception {
      List<ClickStream> clickthroughs = (List<ClickStream>) ClickStream.parseFromTextLine(line);
      return clickthroughs.iterator();
    }
  });
}
Project: incubator-sdap-mudrod    File: SparkDriver.java
public SparkDriver(Properties props) {
  SparkConf conf = new SparkConf().setAppName(props.getProperty(MudrodConstants.SPARK_APP_NAME, "MudrodSparkApp")).setIfMissing("spark.master", props.getProperty(MudrodConstants.SPARK_MASTER))
      .set("spark.hadoop.validateOutputSpecs", "false").set("spark.files.overwrite", "true");

  String esHost = props.getProperty(MudrodConstants.ES_UNICAST_HOSTS);
  String esPort = props.getProperty(MudrodConstants.ES_HTTP_PORT);

  if (!"".equals(esHost)) {
    conf.set("es.nodes", esHost);
  }

  if (!"".equals(esPort)) {
    conf.set("es.port", esPort);
  }

  conf.set("spark.serializer", KryoSerializer.class.getName());
  conf.set("es.batch.size.entries", "1500");

  sc = new JavaSparkContext(conf);
  sqlContext = new SQLContext(sc);
}
Project: betleopard    File: LiveBetMain.java
private void init() throws IOException {
    final ClientConfig config = new ClientConfig();
    client = HazelcastClient.newHazelcastClient(config);

    final SparkConf conf = new SparkConf()
            .set("hazelcast.server.addresses", "127.0.0.1:5701")
            .set("hazelcast.server.groupName", "dev")
            .set("hazelcast.server.groupPass", "dev-pass")
            .set("hazelcast.spark.valueBatchingEnabled", "true")
            .set("hazelcast.spark.readBatchSize", "5000")
            .set("hazelcast.spark.writeBatchSize", "5000");

    sc = new JavaSparkContext("local", "appname", conf);

    loadHistoricalRaces();
    createRandomUsers();
    createFutureEvent();
}
Project: MinoanER    File: CNPNeighborsUnnormalized.java
/**
 * 
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids for the second collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates, 
        JavaRDD<String> rawTriples1, 
        JavaRDD<String> rawTriples2,             
        String SEPARATOR, 
        JavaRDD<String> entityIds1, 
        JavaRDD<String> entityIds2, 
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N, 
        JavaSparkContext jsc) {

    Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));

    Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);

    //JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);        
    JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSimsSUM(topKvalueCandidates, inNeighbors_BV, K);        
    return topKneighborCandidates;
}
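The pattern above (collect a small map on the driver, broadcast it, read it inside transformations) in a minimal standalone form; the method name and data below are assumptions for illustration only:

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;

// A small lookup map is broadcast once and read on the executors.
private static JavaRDD<String> labelIds(JavaSparkContext jsc) {
    Map<Integer, String> lookup = new HashMap<>();
    lookup.put(1, "a");
    lookup.put(2, "b");
    Broadcast<Map<Integer, String>> lookupBV = jsc.broadcast(lookup);

    JavaRDD<Integer> ids = jsc.parallelize(Arrays.asList(1, 2, 3));
    // value() is resolved on each executor; the map is shipped once per executor, not per task.
    return ids.map(id -> lookupBV.value().getOrDefault(id, "unknown"));
}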
Project: MinoanER    File: CNPARCS.java
/**
 * 
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids for the second collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> run2(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates, 
        JavaRDD<String> rawTriples1, 
        JavaRDD<String> rawTriples2,             
        String SEPARATOR, 
        JavaRDD<String> entityIds1, 
        JavaRDD<String> entityIds2, 
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N, 
        JavaSparkContext jsc) {

    Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));

    Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);             
    JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKneighborCandidates =  getTopKNeighborSimsSUMWithScores(topKvalueCandidates, inNeighbors_BV, K);        
    return topKneighborCandidates;
}
Project: MinoanER    File: CNPNeighbors.java
/**
 * 
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids for the second collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates, 
        JavaRDD<String> rawTriples1, 
        JavaRDD<String> rawTriples2,             
        String SEPARATOR, 
        JavaRDD<String> entityIds1, 
        JavaRDD<String> entityIds2, 
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N, 
        JavaSparkContext jsc) {

    Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));

    Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);

    //JavaPairRDD<Tuple2<Integer, Integer>, Float> neighborSims = getNeighborSims(topKvalueCandidates, inNeighbors_BV);        
    //JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSimsOld(neighborSims, K);        
    JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);        
    return topKneighborCandidates;
}
Project: oryx2    File: ExampleBatchLayerUpdate.java
@Override
public void runUpdate(JavaSparkContext sparkContext,
                      long timestamp,
                      JavaPairRDD<String,String> newData,
                      JavaPairRDD<String,String> pastData,
                      String modelDirString,
                      TopicProducer<String,String> modelUpdateTopic) throws IOException {
  JavaPairRDD<String,String> allData = pastData == null ? newData : newData.union(pastData);
  String modelString;
  try {
    modelString = new ObjectMapper().writeValueAsString(countDistinctOtherWords(allData));
  } catch (JsonProcessingException jpe) {
    throw new IOException(jpe);
  }
  modelUpdateTopic.send("MODEL", modelString);
}
Project: MinoanER    File: BlocksFromEntityIndexTest.java
@Before
public void setUp() {
    System.setProperty("hadoop.home.dir", "C:\\Users\\VASILIS\\Documents\\hadoop_home"); //only for local mode

    spark = SparkSession.builder()
        .appName("test") 
        .config("spark.sql.warehouse.dir", "/file:/tmp")                
        .config("spark.executor.instances", 1)
        .config("spark.executor.cores", 1)
        .config("spark.executor.memory", "1G")            
        .config("spark.driver.maxResultSize", "1g")
        .config("spark.master", "local")
        .getOrCreate();

    jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
}
Project: MinoanER    File: BlockFilteringAdvancedTest.java
@Before
public void setUp() {        
    System.setProperty("hadoop.home.dir", "C:\\Users\\VASILIS\\Documents\\hadoop_home"); //only for local mode

    spark = SparkSession.builder()
        .appName("test") 
        .config("spark.sql.warehouse.dir", "/file:/tmp")                
        .config("spark.executor.instances", 1)
        .config("spark.executor.cores", 1)
        .config("spark.executor.memory", "1G")            
        .config("spark.driver.maxResultSize", "1g")
        .config("spark.master", "local")
        .getOrCreate();

    jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
}
Project: ParquetUtils    File: ParquetRepartTest.java
@BeforeClass
public static void createContext() throws IOException {

    Configuration hdfsConfig = HDFSUtils.getConfiguration();
    SparkConf config = new SparkConf();
    config.setMaster("local[*]");
    config.setAppName("my JUnit running Spark");
    sc = new JavaSparkContext(config);
    fileSystem = FileSystem.get(hdfsConfig);
    sqlContext = new SQLContext(sc);
    engine = new ParquetRepartEngine(fileSystem, sqlContext);
}
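A matching teardown is commonly paired with such a setup; a hedged sketch (assumed, not shown in the project) that closes the context after the test class finishes:

import org.junit.AfterClass;

@AfterClass
public static void closeContext() {
    // Assumed counterpart to createContext(): release the local Spark context.
    if (sc != null) {
        sc.close();
    }
}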
Project: oryx2    File: ALSUpdate.java
private static MatrixFactorizationModel pmmlToMFModel(JavaSparkContext sparkContext,
                                                      PMML pmml,
                                                      Path modelParentPath,
                                                      Broadcast<Map<String,Integer>> bUserIDToIndex,
                                                      Broadcast<Map<String,Integer>> bItemIDToIndex) {
  String xPathString = AppPMMLUtils.getExtensionValue(pmml, "X");
  String yPathString = AppPMMLUtils.getExtensionValue(pmml, "Y");
  JavaPairRDD<String,float[]> userRDD = readFeaturesRDD(sparkContext, new Path(modelParentPath, xPathString));
  JavaPairRDD<String,float[]> productRDD = readFeaturesRDD(sparkContext, new Path(modelParentPath, yPathString));
  int rank = userRDD.first()._2().length;
  return new MatrixFactorizationModel(
      rank,
      readAndConvertFeatureRDD(userRDD, bUserIDToIndex),
      readAndConvertFeatureRDD(productRDD, bItemIDToIndex));
}
Project: mutantpdb    File: App.java
public static void main( String[] args )
{
    Dataset<Row> mutations = DataProvider.getMutationsToStructures();
    List<String> pdbIds = mutations.select(col("pdbId"))
            .distinct().toJavaRDD().map(t -> t.getString(0)).collect();

    List<Row> broadcasted = mutations.select("pdbId", "chainId", "pdbAtomPos").collectAsList();
    SaprkUtils.stopSparkSession();

    JavaSparkContext sc = SaprkUtils.getSparkContext();
    Broadcast<List<Row>> bcmut = sc.broadcast(broadcasted);

    MmtfReader//.readSequenceFile("/pdb/2017/full", pdbIds, sc)
            .downloadMmtfFiles(Arrays.asList("5IRC"), sc)
            .flatMapToPair(new StructureToPolymerChains())
            .flatMapToPair(new AddResidueToKey(bcmut))
            .mapValues(new StructureToBioJava())
            .mapToPair(new FilterResidue())
            .filter(t -> t._2!=null).keys()
            .map(t -> t.replace(".", ","))
            .saveAsTextFile("/Users/yana/git/mutantpdb/src/main/resources/pdb_residues");
    sc.close();
}
Project: BLASpark    File: OtherOperations.java
public static DistributedMatrix[] GetLU(DistributedMatrix A, boolean diagonalInL, boolean diagonalInU, JavaSparkContext jsc) {

    if ((diagonalInL && diagonalInU) || (!diagonalInL && !diagonalInU)) {
        LOG.error("Diagonal values must be in either upper or lower matrices");
        System.exit(-1);
    }

    DistributedMatrix[] returnedMatrices;

    if (A.getClass() == IndexedRowMatrix.class) {
        returnedMatrices = OtherOperations.GetLU_IRW((IndexedRowMatrix) A, diagonalInL, diagonalInU, jsc);
    }
    else if (A.getClass() == CoordinateMatrix.class) {
        returnedMatrices = OtherOperations.GetLU_COORD((CoordinateMatrix) A, diagonalInL, diagonalInU, jsc);
    }
    else if (A.getClass() == BlockMatrix.class) {
        // TODO: Implement this operation
        //returnedMatrices = OtherOperations.GetLU_BCK((BlockMatrix) A, diagonalInL, diagonalInU, jsc);
        returnedMatrices = null;
    }
    else {
        returnedMatrices = null;
    }

    return returnedMatrices;
}
Project: BLASpark    File: L2.java
public static DenseVector DGEMV(double alpha, DistributedMatrix A, DenseVector x, double beta, DenseVector y, JavaSparkContext jsc) {

    // First form y := beta*y.
    if (beta != 1.0) {
        if (beta == 0.0) {
            y = Vectors.zeros(y.size()).toDense();
        }
        else {
            BLAS.scal(beta, y);
        }
    }

    if (alpha == 0.0) {
        return y;
    }

    DenseVector tmpVector = Vectors.zeros(y.size()).toDense();

    // Form y := alpha*A*x + y.
    // Case of IndexedRowMatrix
    if (A.getClass() == IndexedRowMatrix.class) {
        tmpVector = L2.DGEMV_IRW((IndexedRowMatrix) A, alpha, x, jsc);
    }
    else if (A.getClass() == CoordinateMatrix.class) {
        tmpVector = L2.DGEMV_COORD((CoordinateMatrix) A, alpha, x, jsc);
    }
    else if (A.getClass() == BlockMatrix.class) {
        tmpVector = L2.DGEMV_BCK((BlockMatrix) A, alpha, x, jsc);
    }
    else {
        tmpVector = null;
    }

    BLAS.axpy(1.0, tmpVector, y);

    return y;
}
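For reference, the recurrence this method implements, y := alpha*A*x + beta*y, written as a plain local (non-distributed) Java sketch on dense arrays; illustrative only, not part of BLASpark:

// Local dense GEMV: result[i] = alpha * (A[i] . x) + beta * y[i].
public static double[] dgemvLocal(double alpha, double[][] A, double[] x, double beta, double[] y) {
    double[] result = new double[y.length];
    for (int i = 0; i < A.length; i++) {
        double dot = 0.0;
        for (int j = 0; j < A[i].length; j++) {
            dot += A[i][j] * x[j];
        }
        result[i] = alpha * dot + beta * y[i];
    }
    return result;
}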
Project: BLASpark    File: L2.java
private static DenseVector DGEMV_COORD(CoordinateMatrix matrix, double alpha, DenseVector vector, JavaSparkContext jsc) {

    JavaRDD<MatrixEntry> items = matrix.entries().toJavaRDD();
    DenseVector result = items.mapPartitions(new MatrixEntriesMultiplication(vector, alpha))
            .reduce(new MatrixEntriesMultiplicationReducer());

    return result;
}
Project: gcp    File: Spark1Batch.java
public static void main(String[] args) {
  SparkConf sc = new SparkConf().setAppName("POC-Batch");
  try(JavaSparkContext jsc = new JavaSparkContext(sc)) {

    JavaRDD<ExampleXML> records = jsc.wholeTextFiles("input/")
        .map(t -> t._2())
        .map(new ParseXML());

    System.out.printf("Amount of XMLs: %d\n", records.count());
  }
}
Project: athena    File: MachineLearningManagerImpl.java
public DecisionTreeValidationSummary validateDecisionTreeAthenaFeatures(JavaSparkContext sc,
                                                                        FeatureConstraint featureConstraint,
                                                                        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                                                        DecisionTreeDetectionModel decisionTreeDetectionModel,
                                                                        Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,            // Configuration
            MongoInputFormat.class,   // InputFormat: read from a live cluster.
            Object.class,             // Key class
            BSONObject.class          // Value class
    );

    DecisionTreeDetectionAlgorithm decisionTreeDetectionAlgorithm = (DecisionTreeDetectionAlgorithm) decisionTreeDetectionModel.getDetectionAlgorithm();

    DecisionTreeValidationSummary decisionTreeValidationSummary =
            new DecisionTreeValidationSummary(sc.sc(), decisionTreeDetectionAlgorithm.getNumClasses(), indexing, marking);

    DecisionTreeDistJob decisionTreeDistJob = new DecisionTreeDistJob();

    decisionTreeDistJob.validate(mongoRDD,
            athenaMLFeatureConfiguration,
            decisionTreeDetectionModel,
            decisionTreeValidationSummary);


    long end = System.nanoTime(); // <-- end
    long time = end - start;

    decisionTreeValidationSummary.setTotalValidationTime(time);
    return decisionTreeValidationSummary;
}
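The mongodbConfig used above is a Hadoop Configuration prepared elsewhere in the class. A minimal sketch of how such a configuration is typically set up for the mongo-hadoop connector (the URI below is a placeholder assumption):

import org.apache.hadoop.conf.Configuration;

// Hypothetical setup; host, database and collection must match the actual deployment.
Configuration mongodbConfig = new Configuration();
mongodbConfig.set("mongo.input.uri", "mongodb://localhost:27017/athena.features");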
Project: athena    File: MachineLearningManagerImpl.java
public GaussianMixtureValidationSummary validateGaussianMixtureAthenaFeatures(JavaSparkContext sc,
                                                                              FeatureConstraint featureConstraint,
                                                                              AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                                                              GaussianMixtureDetectionModel gaussianMixtureDetectionModel,
                                                                              Indexing indexing,
                                                                              Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,            // Configuration
            MongoInputFormat.class,   // InputFormat: read from a live cluster.
            Object.class,             // Key class
            BSONObject.class          // Value class
    );

    GaussianMixtureDetectionAlgorithm gaussianMixtureDetectionAlgorithm = (GaussianMixtureDetectionAlgorithm) gaussianMixtureDetectionModel.getDetectionAlgorithm();
    GaussianMixtureValidationSummary gaussianMixtureValidationSummary =
            new GaussianMixtureValidationSummary(sc.sc(), gaussianMixtureDetectionAlgorithm.getK(), indexing, marking);


    GaussianMixtureDistJob gaussianMixtureDistJob = new GaussianMixtureDistJob();

    gaussianMixtureDistJob.validate(mongoRDD,
            athenaMLFeatureConfiguration,
            gaussianMixtureDetectionModel,
            gaussianMixtureValidationSummary);


    long end = System.nanoTime(); // <-- end
    long time = end - start;

    gaussianMixtureValidationSummary.setTotalValidationTime(time);
    return gaussianMixtureValidationSummary;
}
Project: Java-Data-Science-Cookbook    File: ScalaTest.java
public static void main( String[] args ){
    String inputFile = "data/dummy.txt";
    SparkConf configuration = new SparkConf().setMaster("local[4]").setAppName("My App");
    JavaSparkContext sparkContext = new JavaSparkContext(configuration);
    JavaRDD<String> logData = sparkContext.textFile(inputFile).cache();

    long numberA = logData.filter(new Function<String,Boolean>(){
        private static final long serialVersionUID = 1L;
        public Boolean call(String s){
            return s.length() == 0;
        }
    }).count();
    sparkContext.close();
    System.out.println("Empty Lines: " + numberA);
}
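The anonymous Function above can be written more compactly as a Java 8 lambda; an equivalent sketch:

// Same predicate as the anonymous class: count empty lines.
long numberA = logData.filter(s -> s.isEmpty()).count();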
Project: athena    File: MachineLearningManagerImpl.java
public LassoValidationSummary validateLassoAthenaFeatures(JavaSparkContext sc,
                                                          FeatureConstraint featureConstraint,
                                                          AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                                          LassoDetectionModel lassoDetectionModel,
                                                          Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,            // Configuration
            MongoInputFormat.class,   // InputFormat: read from a live cluster.
            Object.class,             // Key class
            BSONObject.class          // Value class
    );

    LassoDetectionAlgorithm lassoDetectionAlgorithm =
            (LassoDetectionAlgorithm) lassoDetectionModel.getDetectionAlgorithm();

    LassoValidationSummary lassoValidationSummary = new LassoValidationSummary();
    lassoValidationSummary.setLassoDetectionAlgorithm(lassoDetectionAlgorithm);
    LassoDistJob lassoDistJob = new LassoDistJob();

    lassoDistJob.validate(mongoRDD,
            athenaMLFeatureConfiguration,
            lassoDetectionModel,
            lassoValidationSummary);


    long end = System.nanoTime(); // <-- end
    long time = end - start;
    lassoValidationSummary.setValidationTime(time);

    return lassoValidationSummary;
}
Project: athena    File: MachineLearningManagerImpl.java
public RidgeRegressionValidationSummary validateRidgeRegressionAthenaFeatures(JavaSparkContext sc,
                                                                              FeatureConstraint featureConstraint,
                                                                              AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                                                              RidgeRegressionDetectionModel ridgeRegressionDetectionModel,
                                                                              Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,            // Configuration
            MongoInputFormat.class,   // InputFormat: read from a live cluster.
            Object.class,             // Key class
            BSONObject.class          // Value class
    );

    RidgeRegressionDetectionAlgorithm ridgeRegressionDetectionAlgorithm =
            (RidgeRegressionDetectionAlgorithm) ridgeRegressionDetectionModel.getDetectionAlgorithm();

    RidgeRegressionValidationSummary ridgeRegressionValidationSummary = new RidgeRegressionValidationSummary();
    ridgeRegressionValidationSummary.setRidgeRegressionDetectionAlgorithm(ridgeRegressionDetectionAlgorithm);
    RidgeRegressionDistJob ridgeRegressionDistJob = new RidgeRegressionDistJob();

    ridgeRegressionDistJob.validate(mongoRDD,
            athenaMLFeatureConfiguration,
            ridgeRegressionDetectionModel,
            ridgeRegressionValidationSummary);


    long end = System.nanoTime(); // <-- end
    long time = end - start;
    ridgeRegressionValidationSummary.setValidationTime(time);
    return ridgeRegressionValidationSummary;
}
Project: ViraPipe    File: Interleave.java
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
  Path fqpath = new Path(fqPath);
  String fqname = fqpath.getName();
  String[] ns = fqname.split("\\.");
  List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

  JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

  splitRDD.foreach( split ->  {

    FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
    writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);

   });
}
Project: ViraPipe    File: Interleave.java
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {

    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach( splits ->  {
      Path path = splits._1.getPath();
      FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
      FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
      writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir+"/"+path.getParent().getName()+"_"+splits._1.getStart()+".fq");
    });
  }
Project: Explainer    File: SparkUtils.java
public static SparkUtils getInstance() {
  SparkUtils result = instance;
  if (result == null) {
    synchronized (SparkUtils.class) {
      result = instance;
      if (result == null) {
        instance = result = new SparkUtils();
        SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("Explainer");
        jsc = new JavaSparkContext(sparkConf);
        sqlContext = new SQLContext(jsc);
      }
    }
  }
  return result;
}
Project: oryx2    File: ALSUpdate.java
@Override
public void publishAdditionalModelData(JavaSparkContext sparkContext,
                                       PMML pmml,
                                       JavaRDD<String> newData,
                                       JavaRDD<String> pastData,
                                       Path modelParentPath,
                                       TopicProducer<String, String> modelUpdateTopic) {
  // Send item updates first, before users. That way, user-based endpoints like /recommend
  // may return 404 for longer, but once they do respond, the results will be more complete.
  log.info("Sending item / Y data as model updates");
  String yPathString = AppPMMLUtils.getExtensionValue(pmml, "Y");
  JavaPairRDD<String,float[]> productRDD = readFeaturesRDD(sparkContext, new Path(modelParentPath, yPathString));

  String updateBroker = modelUpdateTopic.getUpdateBroker();
  String topic = modelUpdateTopic.getTopic();

  // For now, there is no use in sending known users for each item
  productRDD.foreachPartition(new EnqueueFeatureVecsFn("Y", updateBroker, topic));

  log.info("Sending user / X data as model updates");
  String xPathString = AppPMMLUtils.getExtensionValue(pmml, "X");
  JavaPairRDD<String,float[]> userRDD = readFeaturesRDD(sparkContext, new Path(modelParentPath, xPathString));

  if (noKnownItems) {
    userRDD.foreachPartition(new EnqueueFeatureVecsFn("X", updateBroker, topic));
  } else {
    log.info("Sending known item data with model updates");
    JavaRDD<String[]> allData =
        (pastData == null ? newData : newData.union(pastData)).map(MLFunctions.PARSE_FN);
    JavaPairRDD<String,Collection<String>> knownItems = knownsRDD(allData, true);
    userRDD.join(knownItems).foreachPartition(
        new EnqueueFeatureVecsAndKnownItemsFn("X", updateBroker, topic));
  }
}
Project: athena    File: MachineLearningManagerImpl.java
public NaiveBayesValidationSummary validateNaiveBayesAthenaFeatures(JavaSparkContext sc,
                                                                    FeatureConstraint featureConstraint,
                                                                    AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                                                    NaiveBayesDetectionModel naiveBayesDetectionModel,
                                                                    Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,            // Configuration
            MongoInputFormat.class,   // InputFormat: read from a live cluster.
            Object.class,             // Key class
            BSONObject.class          // Value class
    );

    NaiveBayesDetectionAlgorithm naiveBayesDetectionAlgorithm = (NaiveBayesDetectionAlgorithm) naiveBayesDetectionModel.getDetectionAlgorithm();

    NaiveBayesValidationSummary naiveBayesValidationSummary =
            new NaiveBayesValidationSummary(sc.sc(), naiveBayesDetectionAlgorithm.getNumClasses(), indexing, marking);

    NaiveBayesDistJob naiveBayesDistJob = new NaiveBayesDistJob();


    naiveBayesDistJob.validate(mongoRDD,
            athenaMLFeatureConfiguration,
            naiveBayesDetectionModel,
            naiveBayesValidationSummary);


    long end = System.nanoTime(); // <-- end
    long time = end - start;

    naiveBayesValidationSummary.setTotalValidationTime(time);
    return naiveBayesValidationSummary;
}
Project: athena    File: MachineLearningManagerImpl.java
public GradientBoostedTreesValidationSummary validateGradientBoostedTreesAthenaFeatures(JavaSparkContext sc,
                                                                                        FeatureConstraint featureConstraint,
                                                                                        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                                                                        GradientBoostedTreesDetectionModel gradientBoostedTreesDetectionModel,
                                                                                        Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,            // Configuration
            MongoInputFormat.class,   // InputFormat: read from a live cluster.
            Object.class,             // Key class
            BSONObject.class          // Value class
    );

    GradientBoostedTreesDetectionAlgorithm gradientBoostedTreesDetectionAlgorithm =
            (GradientBoostedTreesDetectionAlgorithm) gradientBoostedTreesDetectionModel.getDetectionAlgorithm();

    GradientBoostedTreesValidationSummary gradientBoostedTreesValidationSummary =
            new GradientBoostedTreesValidationSummary(sc.sc(),
                    gradientBoostedTreesDetectionAlgorithm.getNumClasses(), indexing, marking);

    GradientBoostedTreesDistJob gradientBoostedTreesDistJob = new GradientBoostedTreesDistJob();

    gradientBoostedTreesDistJob.validate(mongoRDD,
            athenaMLFeatureConfiguration,
            gradientBoostedTreesDetectionModel,
            gradientBoostedTreesValidationSummary);


    long end = System.nanoTime(); // <-- end
    long time = end - start;

    gradientBoostedTreesValidationSummary.setTotalValidationTime(time);
    return gradientBoostedTreesValidationSummary;
}
Project: movie-recommender    File: SparkModule.java
@Provides
CassandraIo<RawRating> providesCassandraRatingIO(JavaSparkContext sparkContext) {
    if (ratingCassandraIo != null) {
        return ratingCassandraIo;
    }
    ratingCassandraIo = new CassandraIo<>(RawRating.class, "dev", "ratings");
    ratingCassandraIo.setSparkContext(sparkContext);


    return ratingCassandraIo;
}
Project: metadata-qa-marc    File: ParallelValidator.java
public static void main(String[] args) throws ParseException {

    final Validator validator = new Validator(args);
    ValidatorParameters params = validator.getParameters();
    validator.setDoPrintInProcessRecord(false);

    logger.info("Input file is " + params.getArgs());
    SparkConf conf = new SparkConf().setAppName("MarcCompletenessCount");
    JavaSparkContext context = new JavaSparkContext(conf);

    System.err.println(validator.getParameters().formatParameters());

    JavaRDD<String> inputFile = context.textFile(validator.getParameters().getArgs()[0]);

    JavaRDD<String> baseCountsRDD = inputFile
        .flatMap(content -> {
            MarcReader reader = ReadMarc.getMarcStringReader(content);
            Record marc4jRecord = reader.next();
            MarcRecord marcRecord = MarcFactory.createFromMarc4j(
                marc4jRecord, params.getDefaultRecordType(), params.getMarcVersion(), params.fixAlephseq());
            validator.processRecord(marcRecord, 1);
            return ValidationErrorFormatter
                .formatForSummary(marcRecord.getValidationErrors(), params.getFormat())
                .iterator();
        });
    baseCountsRDD.saveAsTextFile(validator.getParameters().getFileName());
}
Project: athena    File: MachineLearningManagerImpl.java
public LinearRegressionValidationSummary validateLinearRegressionAthenaFeatures(JavaSparkContext sc,
                                                                                FeatureConstraint featureConstraint,
                                                                                AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                                                                LinearRegressionDetectionModel linearRegressionDetectionModel,
                                                                                Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,            // Configuration
            MongoInputFormat.class,   // InputFormat: read from a live cluster.
            Object.class,             // Key class
            BSONObject.class          // Value class
    );

    LinearRegressionDetectionAlgorithm linearRegressionDetectionAlgorithm =
            (LinearRegressionDetectionAlgorithm) linearRegressionDetectionModel.getDetectionAlgorithm();

    LinearRegressionValidationSummary linearRegressionValidationSummary =
            new LinearRegressionValidationSummary();
    linearRegressionValidationSummary.setLinearRegressionDetectionAlgorithm(linearRegressionDetectionAlgorithm);

    LinearRegressionDistJob linearRegressionDistJob = new LinearRegressionDistJob();

    linearRegressionDistJob.validate(mongoRDD,
            athenaMLFeatureConfiguration,
            linearRegressionDetectionModel,
            linearRegressionValidationSummary);


    long end = System.nanoTime(); // <-- end
    long time = end - start;
    linearRegressionValidationSummary.setValidationTime(time);


    return linearRegressionValidationSummary;
}
Project: rdf2x    File: TestUtils.java
/**
 * Parse RDF file from resources folder
 * @param sc spark context to use for parsing
 * @param fileName name of the file to parse
 * @return RDD of quads from the requested file
 */
public static JavaRDD<Quad> getQuadsRDD(JavaSparkContext sc, String fileName) {
    QuadParser parser = new ElephasQuadParser(
            new QuadParserConfig()
                    .setBatchSize(2),
            sc
    );
    String path = TestUtils.getDatasetPath(fileName);
    return parser.parseQuads(path);
}