public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage:");
        System.err.println("  SparkWordCount <sourceFile> <targetFile>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf().setAppName("Word Count");
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaRDD<String> textFile = sc.textFile(args[0]);
    JavaRDD<String> words = textFile.flatMap(LineIterator::new);
    JavaPairRDD<String, Long> pairs = words.mapToPair(s -> new Tuple2<>(s, 1L));
    JavaPairRDD<String, Long> counts = pairs.reduceByKey((Function2<Long, Long, Long>) (a, b) -> a + b);

    System.out.println("Starting task..");
    long t = System.currentTimeMillis();
    counts.saveAsTextFile(args[1] + "_" + t);
    System.out.println("Time=" + (System.currentTimeMillis() - t));
}
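The LineIterator class used by the flatMap above is not shown in this snippet. A minimal sketch of what such a tokenizing iterator could look like follows; the class body and the whitespace-splitting behavior are assumptions, not the original implementation.

import java.util.Iterator;
import java.util.NoSuchElementException;

// Hypothetical sketch: iterates over the whitespace-separated tokens of one line.
// Because its constructor takes a String and it implements Iterator<String>,
// the constructor reference LineIterator::new satisfies FlatMapFunction<String, String>.
public class LineIterator implements Iterator<String> {

    private final String[] tokens;
    private int position = 0;

    public LineIterator(String line) {
        String trimmed = line.trim();
        this.tokens = trimmed.isEmpty() ? new String[0] : trimmed.split("\\s+");
    }

    @Override
    public boolean hasNext() {
        return position < tokens.length;
    }

    @Override
    public String next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        return tokens[position++];
    }
}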
/**
 * Initializes a Spark connection. Use it afterwards for execution of Spark
 * SQL queries.
 *
 * @param appName
 *            the name of the app that will be used with this Spark connection
 * @param database
 *            the name of the database that will be used with this Spark connection
 */
public Spark(String appName, String database) {
    // TODO check what will happen if the same app name is already in use
    this.sparkConfiguration = new SparkConf().setAppName(appName);
    this.javaContext = new JavaSparkContext(sparkConfiguration);
    this.hiveContext = new HiveContext(javaContext);

    // TODO check what kind of exception can be thrown here if there is a
    // problem with the Spark connection
    this.hiveContext.sql(String.format("CREATE DATABASE %s", database));
    // TODO check what kind of exception is thrown if the database already exists

    // use the created database
    this.hiveContext.sql(String.format("USE %s", database));
}
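A minimal usage sketch of this constructor, assuming the application is launched via spark-submit so no master needs to be set explicitly; the app and database names below are placeholders.

public class SparkConnectionExample {
    public static void main(String[] args) {
        // Creates the database and switches to it (see the constructor above).
        // How further queries are issued (e.g. through a getter on hiveContext)
        // is not shown in the snippet and is left open here.
        Spark spark = new Spark("WordStatsApp", "word_stats_db");
    }
}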
/**
 * @param sparkContext    active Spark Context
 * @param trainData       training data on which to build a model
 * @param hyperParameters ordered list of hyper parameter values to use in building model
 * @param candidatePath   directory where additional model files can be written
 * @return a {@link PMML} representation of a model trained on the given data
 */
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
    int numClusters = (Integer) hyperParameters.get(0);
    Preconditions.checkArgument(numClusters > 1);
    log.info("Building KMeans Model with {} clusters", numClusters);

    JavaRDD<Vector> trainingData = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));
    KMeansModel kMeansModel = KMeans.train(trainingData.rdd(), numClusters, maxIterations,
        numberOfRuns, initializationStrategy);

    return kMeansModelToPMML(kMeansModel, fetchClusterCountsFromModel(trainingData, kMeansModel));
}
public static DistributedMatrix GetLU(DistributedMatrix A, JavaSparkContext jsc) {
    DistributedMatrix returnedMatrix;

    if (A.getClass() == IndexedRowMatrix.class) {
        returnedMatrix = OtherOperations.GetLU_IRW((IndexedRowMatrix) A);
    } else if (A.getClass() == CoordinateMatrix.class) {
        returnedMatrix = OtherOperations.GetLU_COORD((CoordinateMatrix) A);
    } else if (A.getClass() == BlockMatrix.class) {
        // TODO: Implement this operation
        //returnedMatrices = OtherOperations.GetLU_BCK((BlockMatrix) A, diagonalInL, diagonalInU, jsc);
        returnedMatrix = null;
    } else {
        returnedMatrix = null;
    }

    return returnedMatrix;
}
public static DistributedMatrix GetD(DistributedMatrix A, boolean inverseValues, JavaSparkContext jsc) {
    DistributedMatrix returnedMatrix;

    if (A.getClass() == IndexedRowMatrix.class) {
        returnedMatrix = OtherOperations.GetD_IRW((IndexedRowMatrix) A, inverseValues, jsc);
    } else if (A.getClass() == CoordinateMatrix.class) {
        returnedMatrix = OtherOperations.GetD_COORD((CoordinateMatrix) A, inverseValues, jsc);
    } else if (A.getClass() == BlockMatrix.class) {
        // TODO: Implement this operation
        //returnedMatrices = OtherOperations.GetLU_BCK((BlockMatrix) A, diagonalInL, diagonalInU, jsc);
        returnedMatrix = null;
    } else {
        returnedMatrix = null;
    }

    return returnedMatrix;
}
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException { Path fqpath = new Path(fqPath); String fqname = fqpath.getName(); String[] ns = fqname.split("\\."); //TODO: Handle also compressed files List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen); JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif); splitRDD.foreach( split -> { FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split); writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]); }); }
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen,
        JavaSparkContext sc) throws IOException {
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach(splits -> {
        Path path = splits._1.getPath();
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
        FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
        writeInterleavedSplits(fqreader, fqreader2, new Configuration(),
            splitDir + "/" + path.getParent().getName() + "_" + splits._1.getStart() + ".fq");
    });
}
public static void wordCountJava8(String filename) {
    // Define a configuration to use to interact with Spark
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Word Count App");

    // Create a Java version of the Spark Context from the configuration
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Load the input data, which is a text file read from the command line
    JavaRDD<String> input = sc.textFile(filename);

    // Java 8 with lambdas: split the input string into words
    // TODO here a change has happened
    JavaRDD<String> words = input.flatMap(s -> Arrays.asList(s.split(" ")).iterator());

    // Java 8 with lambdas: transform the collection of words into pairs (word and 1) and then count them
    JavaPairRDD<String, Integer> counts = words.mapToPair(t -> new Tuple2<>(t, 1))
                                               .reduceByKey((x, y) -> x + y);

    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile("output");
}
public static void main(String[] args) throws Exception {
    System.out.println(System.getProperty("hadoop.home.dir"));

    String inputPath = args[0];
    String outputPath = args[1];
    FileUtils.deleteQuietly(new File(outputPath));

    JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");

    JavaRDD<String> rdd = sc.textFile(inputPath);
    JavaPairRDD<String, Integer> counts = rdd
        .flatMap(x -> Arrays.asList(x.split(" ")).iterator())
        .mapToPair(x -> new Tuple2<String, Integer>(x, 1))
        .reduceByKey((x, y) -> x + y);

    counts.saveAsTextFile(outputPath);
    sc.close();
}
/**
 * loadClickStremFromTxt: load a click stream from a txt file
 *
 * @param clickthroughFile
 *            txt file
 * @param sc
 *            the spark context
 * @return clickstream list in JavaRDD format {@link ClickStream}
 */
public JavaRDD<ClickStream> loadClickStremFromTxt(String clickthroughFile, JavaSparkContext sc) {
    return sc.textFile(clickthroughFile).flatMap(new FlatMapFunction<String, ClickStream>() {

        private static final long serialVersionUID = 1L;

        @SuppressWarnings("unchecked")
        @Override
        public Iterator<ClickStream> call(String line) throws Exception {
            List<ClickStream> clickthroughs = (List<ClickStream>) ClickStream.parseFromTextLine(line);
            return clickthroughs.iterator();
        }
    });
}
public SparkDriver(Properties props) {
    SparkConf conf = new SparkConf()
        .setAppName(props.getProperty(MudrodConstants.SPARK_APP_NAME, "MudrodSparkApp"))
        .setIfMissing("spark.master", props.getProperty(MudrodConstants.SPARK_MASTER))
        .set("spark.hadoop.validateOutputSpecs", "false")
        .set("spark.files.overwrite", "true");

    String esHost = props.getProperty(MudrodConstants.ES_UNICAST_HOSTS);
    String esPort = props.getProperty(MudrodConstants.ES_HTTP_PORT);

    if (!"".equals(esHost)) {
        conf.set("es.nodes", esHost);
    }

    if (!"".equals(esPort)) {
        conf.set("es.port", esPort);
    }

    conf.set("spark.serializer", KryoSerializer.class.getName());
    conf.set("es.batch.size.entries", "1500");

    sc = new JavaSparkContext(conf);
    sqlContext = new SQLContext(sc);
}
private void init() throws IOException {
    final ClientConfig config = new ClientConfig();
    client = HazelcastClient.newHazelcastClient(config);

    final SparkConf conf = new SparkConf()
        .set("hazelcast.server.addresses", "127.0.0.1:5701")
        .set("hazelcast.server.groupName", "dev")
        .set("hazelcast.server.groupPass", "dev-pass")
        .set("hazelcast.spark.valueBatchingEnabled", "true")
        .set("hazelcast.spark.readBatchSize", "5000")
        .set("hazelcast.spark.writeBatchSize", "5000");

    sc = new JavaSparkContext("local", "appname", conf);

    loadHistoricalRaces();
    createRandomUsers();
    createFutureEvent();
}
/**
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids of the first collection, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids of the second collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which relations are discarded from the top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKvalueCandidates,
        JavaRDD<String> rawTriples1,
        JavaRDD<String> rawTriples2,
        String SEPARATOR,
        JavaRDD<String> entityIds1,
        JavaRDD<String> entityIds2,
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N,
        JavaSparkContext jsc) {
    Map<Integer, IntArrayList> inNeighbors = new HashMap<>(
        new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(
        new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));

    Broadcast<Map<Integer, IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);

    //JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);
    JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSimsSUM(topKvalueCandidates, inNeighbors_BV, K);
    return topKneighborCandidates;
}
/**
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids of the first collection, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids of the second collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which relations are discarded from the top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity, along with their neighbor similarity scores
 */
public JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> run2(JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKvalueCandidates,
        JavaRDD<String> rawTriples1,
        JavaRDD<String> rawTriples2,
        String SEPARATOR,
        JavaRDD<String> entityIds1,
        JavaRDD<String> entityIds2,
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N,
        JavaSparkContext jsc) {
    Map<Integer, IntArrayList> inNeighbors = new HashMap<>(
        new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(
        new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));

    Broadcast<Map<Integer, IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);

    JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKneighborCandidates =
        getTopKNeighborSimsSUMWithScores(topKvalueCandidates, inNeighbors_BV, K);
    return topKneighborCandidates;
}
/**
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids of the first collection, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids of the second collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which relations are discarded from the top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKvalueCandidates,
        JavaRDD<String> rawTriples1,
        JavaRDD<String> rawTriples2,
        String SEPARATOR,
        JavaRDD<String> entityIds1,
        JavaRDD<String> entityIds2,
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N,
        JavaSparkContext jsc) {
    Map<Integer, IntArrayList> inNeighbors = new HashMap<>(
        new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(
        new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));

    Broadcast<Map<Integer, IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);

    //JavaPairRDD<Tuple2<Integer, Integer>, Float> neighborSims = getNeighborSims(topKvalueCandidates, inNeighbors_BV);
    //JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSimsOld(neighborSims, K);
    JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);
    return topKneighborCandidates;
}
@Override
public void runUpdate(JavaSparkContext sparkContext,
                      long timestamp,
                      JavaPairRDD<String, String> newData,
                      JavaPairRDD<String, String> pastData,
                      String modelDirString,
                      TopicProducer<String, String> modelUpdateTopic) throws IOException {
    JavaPairRDD<String, String> allData = pastData == null ? newData : newData.union(pastData);

    String modelString;
    try {
        modelString = new ObjectMapper().writeValueAsString(countDistinctOtherWords(allData));
    } catch (JsonProcessingException jpe) {
        throw new IOException(jpe);
    }

    modelUpdateTopic.send("MODEL", modelString);
}
@Before
public void setUp() {
    System.setProperty("hadoop.home.dir", "C:\\Users\\VASILIS\\Documents\\hadoop_home"); // only for local mode

    spark = SparkSession.builder()
        .appName("test")
        .config("spark.sql.warehouse.dir", "/file:/tmp")
        .config("spark.executor.instances", 1)
        .config("spark.executor.cores", 1)
        .config("spark.executor.memory", "1G")
        .config("spark.driver.maxResultSize", "1g")
        .config("spark.master", "local")
        .getOrCreate();

    jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
}
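No teardown is shown for this fixture. A minimal sketch of one, assuming the same spark and jsc fields, stops the session after each test so that local runs do not leak contexts.

import org.junit.After;

// Hypothetical teardown to pair with the setUp above (assumes the same spark/jsc fields).
@After
public void tearDown() {
    if (spark != null) {
        spark.stop(); // also stops the underlying SparkContext wrapped by jsc
    }
}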
@BeforeClass
public static void createContext() throws IOException {
    Configuration hdfsConfig = HDFSUtils.getConfiguration();

    SparkConf config = new SparkConf();
    config.setMaster("local[*]");
    config.setAppName("my JUnit running Spark");

    sc = new JavaSparkContext(config);
    fileSystem = FileSystem.get(hdfsConfig);
    sqlContext = new SQLContext(sc);
    engine = new ParquetRepartEngine(fileSystem, sqlContext);
}
private static MatrixFactorizationModel pmmlToMFModel(JavaSparkContext sparkContext,
                                                      PMML pmml,
                                                      Path modelParentPath,
                                                      Broadcast<Map<String, Integer>> bUserIDToIndex,
                                                      Broadcast<Map<String, Integer>> bItemIDToIndex) {
    String xPathString = AppPMMLUtils.getExtensionValue(pmml, "X");
    String yPathString = AppPMMLUtils.getExtensionValue(pmml, "Y");

    JavaPairRDD<String, float[]> userRDD = readFeaturesRDD(sparkContext, new Path(modelParentPath, xPathString));
    JavaPairRDD<String, float[]> productRDD = readFeaturesRDD(sparkContext, new Path(modelParentPath, yPathString));

    int rank = userRDD.first()._2().length;
    return new MatrixFactorizationModel(
        rank,
        readAndConvertFeatureRDD(userRDD, bUserIDToIndex),
        readAndConvertFeatureRDD(productRDD, bItemIDToIndex));
}
public static void main(String[] args) {
    Dataset<Row> mutations = DataProvider.getMutationsToStructures();
    List<String> pdbIds = mutations.select(col("pdbId"))
        .distinct().toJavaRDD().map(t -> t.getString(0)).collect();

    List<Row> broadcasted = mutations.select("pdbId", "chainId", "pdbAtomPos").collectAsList();
    SaprkUtils.stopSparkSession();

    JavaSparkContext sc = SaprkUtils.getSparkContext();
    Broadcast<List<Row>> bcmut = sc.broadcast(broadcasted);

    MmtfReader//.readSequenceFile("/pdb/2017/full", pdbIds, sc)
        .downloadMmtfFiles(Arrays.asList("5IRC"), sc)
        .flatMapToPair(new StructureToPolymerChains())
        .flatMapToPair(new AddResidueToKey(bcmut))
        .mapValues(new StructureToBioJava())
        .mapToPair(new FilterResidue())
        .filter(t -> t._2 != null).keys()
        .map(t -> t.replace(".", ","))
        .saveAsTextFile("/Users/yana/git/mutantpdb/src/main/resources/pdb_residues");

    sc.close();
}
public static DistributedMatrix[] GetLU(DistributedMatrix A, boolean diagonalInL, boolean diagonalInU,
        JavaSparkContext jsc) {
    if ((diagonalInL && diagonalInU) || (!diagonalInL && !diagonalInU)) {
        LOG.error("Diagonal values must be in either upper or lower matrices");
        System.exit(-1);
    }

    DistributedMatrix[] returnedMatrices;

    if (A.getClass() == IndexedRowMatrix.class) {
        returnedMatrices = OtherOperations.GetLU_IRW((IndexedRowMatrix) A, diagonalInL, diagonalInU, jsc);
    } else if (A.getClass() == CoordinateMatrix.class) {
        returnedMatrices = OtherOperations.GetLU_COORD((CoordinateMatrix) A, diagonalInL, diagonalInU, jsc);
    } else if (A.getClass() == BlockMatrix.class) {
        // TODO: Implement this operation
        //returnedMatrices = OtherOperations.GetLU_BCK((BlockMatrix) A, diagonalInL, diagonalInU, jsc);
        returnedMatrices = null;
    } else {
        returnedMatrices = null;
    }

    return returnedMatrices;
}
public static DenseVector DGEMV(double alpha, DistributedMatrix A, DenseVector x, double beta, DenseVector y,
        JavaSparkContext jsc) {
    // First form y := beta*y.
    if (beta != 1.0) {
        if (beta == 0.0) {
            y = Vectors.zeros(y.size()).toDense();
        } else {
            BLAS.scal(beta, y);
        }
    }

    if (alpha == 0.0) {
        return y;
    }

    DenseVector tmpVector = Vectors.zeros(y.size()).toDense();

    // Form y := alpha*A*x + y.
    // Case of IndexedRowMatrix
    if (A.getClass() == IndexedRowMatrix.class) {
        tmpVector = L2.DGEMV_IRW((IndexedRowMatrix) A, alpha, x, jsc);
    } else if (A.getClass() == CoordinateMatrix.class) {
        tmpVector = L2.DGEMV_COORD((CoordinateMatrix) A, alpha, x, jsc);
    } else if (A.getClass() == BlockMatrix.class) {
        tmpVector = L2.DGEMV_BCK((BlockMatrix) A, alpha, x, jsc);
    } else {
        tmpVector = null;
    }

    BLAS.axpy(1.0, tmpVector, y);
    return y;
}
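A minimal usage sketch of this BLAS-style routine, computing y := alpha*A*x + beta*y for a small IndexedRowMatrix. The enclosing class is assumed to be L2 (matching the static helper calls above), and the matrix and vector values are illustrative only.

import java.util.Arrays;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.DenseVector;
import org.apache.spark.mllib.linalg.distributed.IndexedRow;
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix;
import org.apache.spark.mllib.linalg.Vectors;

public class DgemvExample {
    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext("local[*]", "DGEMV example");

        // 2x2 matrix A = [[1, 2], [3, 4]] as an IndexedRowMatrix.
        IndexedRowMatrix A = new IndexedRowMatrix(jsc.parallelize(Arrays.asList(
            new IndexedRow(0, Vectors.dense(1.0, 2.0)),
            new IndexedRow(1, Vectors.dense(3.0, 4.0)))).rdd());

        DenseVector x = new DenseVector(new double[]{1.0, 1.0});
        DenseVector y = new DenseVector(new double[]{0.5, 0.5});

        // y := 2*A*x + 1*y; with these values the expected result is [6.5, 14.5].
        DenseVector result = L2.DGEMV(2.0, A, x, 1.0, y, jsc);
        System.out.println(result);

        jsc.close();
    }
}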
private static DenseVector DGEMV_COORD(CoordinateMatrix matrix, double alpha, DenseVector vector,
        JavaSparkContext jsc) {
    JavaRDD<MatrixEntry> items = matrix.entries().toJavaRDD();
    DenseVector result = items.mapPartitions(new MatrixEntriesMultiplication(vector, alpha))
        .reduce(new MatrixEntriesMultiplicationReducer());

    return result;
}
public static void main(String[] args) {
    SparkConf sc = new SparkConf().setAppName("POC-Batch");
    try (JavaSparkContext jsc = new JavaSparkContext(sc)) {
        JavaRDD<ExampleXML> records = jsc.wholeTextFiles("input/")
            .map(t -> t._2())
            .map(new ParseXML());

        System.out.printf("Amount of XMLs: %d\n", records.count());
    }
}
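The ParseXML function used above is not shown. A sketch of what such a mapper could look like, assuming ExampleXML is a JAXB-annotated bean, follows; the class body is entirely hypothetical and not the original implementation.

import java.io.StringReader;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.Unmarshaller;

import org.apache.spark.api.java.function.Function;

// Hypothetical sketch: unmarshals one whole-text-file payload into an ExampleXML bean.
public class ParseXML implements Function<String, ExampleXML> {

    @Override
    public ExampleXML call(String xml) throws Exception {
        // JAXBContext creation is relatively expensive; a per-call instance keeps the sketch simple.
        JAXBContext context = JAXBContext.newInstance(ExampleXML.class);
        Unmarshaller unmarshaller = context.createUnmarshaller();
        return (ExampleXML) unmarshaller.unmarshal(new StringReader(xml));
    }
}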
public DecisionTreeValidationSummary validateDecisionTreeAthenaFeatures(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        DecisionTreeDetectionModel decisionTreeDetectionModel,
        Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
        mongodbConfig,          // Configuration
        MongoInputFormat.class, // InputFormat: read from a live cluster.
        Object.class,           // Key class
        BSONObject.class        // Value class
    );

    DecisionTreeDetectionAlgorithm decisionTreeDetectionAlgorithm =
        (DecisionTreeDetectionAlgorithm) decisionTreeDetectionModel.getDetectionAlgorithm();

    DecisionTreeValidationSummary decisionTreeValidationSummary =
        new DecisionTreeValidationSummary(sc.sc(), decisionTreeDetectionAlgorithm.getNumClasses(),
            indexing, marking);

    DecisionTreeDistJob decisionTreeDistJob = new DecisionTreeDistJob();
    decisionTreeDistJob.validate(mongoRDD, athenaMLFeatureConfiguration, decisionTreeDetectionModel,
        decisionTreeValidationSummary);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    decisionTreeValidationSummary.setTotalValidationTime(time);

    return decisionTreeValidationSummary;
}
public GaussianMixtureValidationSummary validateGaussianMixtureAthenaFeatures(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        GaussianMixtureDetectionModel gaussianMixtureDetectionModel,
        Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
        mongodbConfig,          // Configuration
        MongoInputFormat.class, // InputFormat: read from a live cluster.
        Object.class,           // Key class
        BSONObject.class        // Value class
    );

    GaussianMixtureDetectionAlgorithm gaussianMixtureDetectionAlgorithm =
        (GaussianMixtureDetectionAlgorithm) gaussianMixtureDetectionModel.getDetectionAlgorithm();

    GaussianMixtureValidationSummary gaussianMixtureValidationSummary =
        new GaussianMixtureValidationSummary(sc.sc(), gaussianMixtureDetectionAlgorithm.getK(),
            indexing, marking);

    GaussianMixtureDistJob gaussianMixtureDistJob = new GaussianMixtureDistJob();
    gaussianMixtureDistJob.validate(mongoRDD, athenaMLFeatureConfiguration, gaussianMixtureDetectionModel,
        gaussianMixtureValidationSummary);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    gaussianMixtureValidationSummary.setTotalValidationTime(time);

    return gaussianMixtureValidationSummary;
}
public static void main(String[] args) {
    String inputFile = "data/dummy.txt";

    SparkConf configuration = new SparkConf().setMaster("local[4]").setAppName("My App");
    JavaSparkContext sparkContext = new JavaSparkContext(configuration);

    JavaRDD<String> logData = sparkContext.textFile(inputFile).cache();

    long numberA = logData.filter(new Function<String, Boolean>() {
        private static final long serialVersionUID = 1L;

        public Boolean call(String s) {
            return s.length() == 0;
        }
    }).count();

    sparkContext.close();
    System.out.println("Empty Lines: " + numberA);
}
public LassoValidationSummary validateLassoAthenaFeatures(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        LassoDetectionModel lassoDetectionModel,
        Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
        mongodbConfig,          // Configuration
        MongoInputFormat.class, // InputFormat: read from a live cluster.
        Object.class,           // Key class
        BSONObject.class        // Value class
    );

    LassoDetectionAlgorithm lassoDetectionAlgorithm =
        (LassoDetectionAlgorithm) lassoDetectionModel.getDetectionAlgorithm();

    LassoValidationSummary lassoValidationSummary = new LassoValidationSummary();
    lassoValidationSummary.setLassoDetectionAlgorithm(lassoDetectionAlgorithm);

    LassoDistJob lassoDistJob = new LassoDistJob();
    lassoDistJob.validate(mongoRDD, athenaMLFeatureConfiguration, lassoDetectionModel,
        lassoValidationSummary);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    lassoValidationSummary.setValidationTime(time);

    return lassoValidationSummary;
}
public RidgeRegressionValidationSummary validateRidgeRegressionAthenaFeatures(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        RidgeRegressionDetectionModel ridgeRegressionDetectionModel,
        Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
        mongodbConfig,          // Configuration
        MongoInputFormat.class, // InputFormat: read from a live cluster.
        Object.class,           // Key class
        BSONObject.class        // Value class
    );

    RidgeRegressionDetectionAlgorithm ridgeRegressionDetectionAlgorithm =
        (RidgeRegressionDetectionAlgorithm) ridgeRegressionDetectionModel.getDetectionAlgorithm();

    RidgeRegressionValidationSummary ridgeRegressionValidationSummary = new RidgeRegressionValidationSummary();
    ridgeRegressionValidationSummary.setRidgeRegressionDetectionAlgorithm(ridgeRegressionDetectionAlgorithm);

    RidgeRegressionDistJob ridgeRegressionDistJob = new RidgeRegressionDistJob();
    ridgeRegressionDistJob.validate(mongoRDD, athenaMLFeatureConfiguration, ridgeRegressionDetectionModel,
        ridgeRegressionValidationSummary);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    ridgeRegressionValidationSummary.setValidationTime(time);

    return ridgeRegressionValidationSummary;
}
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException { Path fqpath = new Path(fqPath); String fqname = fqpath.getName(); String[] ns = fqname.split("\\."); List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen); JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif); splitRDD.foreach( split -> { FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split); writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]); }); }
public static SparkUtils getInstance() {
    SparkUtils result = instance;
    if (result == null) {
        synchronized (SparkUtils.class) {
            result = instance;
            if (result == null) {
                instance = result = new SparkUtils();
                SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("Explainer");
                jsc = new JavaSparkContext(sparkConf);
                sqlContext = new SQLContext(jsc);
            }
        }
    }
    return result;
}
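The fields backing this double-checked-locking singleton are not shown. For the pattern to be safe under the Java memory model, the instance field needs to be declared volatile. A sketch of declarations that would typically accompany the method, with field names inferred from its body, follows.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;

// Hypothetical class skeleton around getInstance(); only the field names are taken from the method body.
public class SparkUtils {

    // volatile is required for double-checked locking to publish the instance safely.
    private static volatile SparkUtils instance;
    private static JavaSparkContext jsc;
    private static SQLContext sqlContext;

    private SparkUtils() { }

    // getInstance() as shown above would accompany these declarations.
}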
@Override
public void publishAdditionalModelData(JavaSparkContext sparkContext,
                                       PMML pmml,
                                       JavaRDD<String> newData,
                                       JavaRDD<String> pastData,
                                       Path modelParentPath,
                                       TopicProducer<String, String> modelUpdateTopic) {
    // Send item updates first, before users. That way, user-based endpoints like /recommend
    // may take longer to not return 404, but when they do, the result will be more complete.
    log.info("Sending item / Y data as model updates");
    String yPathString = AppPMMLUtils.getExtensionValue(pmml, "Y");
    JavaPairRDD<String, float[]> productRDD =
        readFeaturesRDD(sparkContext, new Path(modelParentPath, yPathString));

    String updateBroker = modelUpdateTopic.getUpdateBroker();
    String topic = modelUpdateTopic.getTopic();

    // For now, there is no use in sending known users for each item
    productRDD.foreachPartition(new EnqueueFeatureVecsFn("Y", updateBroker, topic));

    log.info("Sending user / X data as model updates");
    String xPathString = AppPMMLUtils.getExtensionValue(pmml, "X");
    JavaPairRDD<String, float[]> userRDD =
        readFeaturesRDD(sparkContext, new Path(modelParentPath, xPathString));

    if (noKnownItems) {
        userRDD.foreachPartition(new EnqueueFeatureVecsFn("X", updateBroker, topic));
    } else {
        log.info("Sending known item data with model updates");
        JavaRDD<String[]> allData =
            (pastData == null ? newData : newData.union(pastData)).map(MLFunctions.PARSE_FN);
        JavaPairRDD<String, Collection<String>> knownItems = knownsRDD(allData, true);
        userRDD.join(knownItems).foreachPartition(
            new EnqueueFeatureVecsAndKnownItemsFn("X", updateBroker, topic));
    }
}
public NaiveBayesValidationSummary validateNaiveBayesAthenaFeatures(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        NaiveBayesDetectionModel naiveBayesDetectionModel,
        Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
        mongodbConfig,          // Configuration
        MongoInputFormat.class, // InputFormat: read from a live cluster.
        Object.class,           // Key class
        BSONObject.class        // Value class
    );

    NaiveBayesDetectionAlgorithm naiveBayesDetectionAlgorithm =
        (NaiveBayesDetectionAlgorithm) naiveBayesDetectionModel.getDetectionAlgorithm();

    NaiveBayesValidationSummary naiveBayesValidationSummary =
        new NaiveBayesValidationSummary(sc.sc(), naiveBayesDetectionAlgorithm.getNumClasses(),
            indexing, marking);

    NaiveBayesDistJob naiveBayesDistJob = new NaiveBayesDistJob();
    naiveBayesDistJob.validate(mongoRDD, athenaMLFeatureConfiguration, naiveBayesDetectionModel,
        naiveBayesValidationSummary);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    naiveBayesValidationSummary.setTotalValidationTime(time);

    return naiveBayesValidationSummary;
}
public GradientBoostedTreesValidationSummary validateGradientBoostedTreesAthenaFeatures(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        GradientBoostedTreesDetectionModel gradientBoostedTreesDetectionModel,
        Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
        mongodbConfig,          // Configuration
        MongoInputFormat.class, // InputFormat: read from a live cluster.
        Object.class,           // Key class
        BSONObject.class        // Value class
    );

    GradientBoostedTreesDetectionAlgorithm gradientBoostedTreesDetectionAlgorithm =
        (GradientBoostedTreesDetectionAlgorithm) gradientBoostedTreesDetectionModel.getDetectionAlgorithm();

    GradientBoostedTreesValidationSummary gradientBoostedTreesValidationSummary =
        new GradientBoostedTreesValidationSummary(sc.sc(),
            gradientBoostedTreesDetectionAlgorithm.getNumClasses(), indexing, marking);

    GradientBoostedTreesDistJob gradientBoostedTreesDistJob = new GradientBoostedTreesDistJob();
    gradientBoostedTreesDistJob.validate(mongoRDD, athenaMLFeatureConfiguration,
        gradientBoostedTreesDetectionModel, gradientBoostedTreesValidationSummary);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    gradientBoostedTreesValidationSummary.setTotalValidationTime(time);

    return gradientBoostedTreesValidationSummary;
}
@Provides
CassandraIo<RawRating> providesCassandraRatingIO(JavaSparkContext sparkContext) {
    if (ratingCassandraIo != null) {
        return ratingCassandraIo;
    }

    ratingCassandraIo = new CassandraIo<>(RawRating.class, "dev", "ratings");
    ratingCassandraIo.setSparkContext(sparkContext);

    return ratingCassandraIo;
}
public static void main(String[] args) throws ParseException {
    final Validator validator = new Validator(args);
    ValidatorParameters params = validator.getParameters();
    validator.setDoPrintInProcessRecord(false);

    logger.info("Input file is " + params.getArgs());

    SparkConf conf = new SparkConf().setAppName("MarcCompletenessCount");
    JavaSparkContext context = new JavaSparkContext(conf);

    System.err.println(validator.getParameters().formatParameters());

    JavaRDD<String> inputFile = context.textFile(validator.getParameters().getArgs()[0]);

    JavaRDD<String> baseCountsRDD = inputFile
        .flatMap(content -> {
            MarcReader reader = ReadMarc.getMarcStringReader(content);
            Record marc4jRecord = reader.next();
            MarcRecord marcRecord = MarcFactory.createFromMarc4j(
                marc4jRecord, params.getDefaultRecordType(), params.getMarcVersion(), params.fixAlephseq());
            validator.processRecord(marcRecord, 1);
            return ValidationErrorFormatter
                .formatForSummary(marcRecord.getValidationErrors(), params.getFormat())
                .iterator();
        });

    baseCountsRDD.saveAsTextFile(validator.getParameters().getFileName());
}
public LinearRegressionValidationSummary validateLinearRegressionAthenaFeatures(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        LinearRegressionDetectionModel linearRegressionDetectionModel,
        Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
        mongodbConfig,          // Configuration
        MongoInputFormat.class, // InputFormat: read from a live cluster.
        Object.class,           // Key class
        BSONObject.class        // Value class
    );

    LinearRegressionDetectionAlgorithm linearRegressionDetectionAlgorithm =
        (LinearRegressionDetectionAlgorithm) linearRegressionDetectionModel.getDetectionAlgorithm();

    LinearRegressionValidationSummary linearRegressionValidationSummary = new LinearRegressionValidationSummary();
    linearRegressionValidationSummary.setLinearRegressionDetectionAlgorithm(linearRegressionDetectionAlgorithm);

    LinearRegressionDistJob linearRegressionDistJob = new LinearRegressionDistJob();
    linearRegressionDistJob.validate(mongoRDD, athenaMLFeatureConfiguration, linearRegressionDetectionModel,
        linearRegressionValidationSummary);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    linearRegressionValidationSummary.setValidationTime(time);

    return linearRegressionValidationSummary;
}
/**
 * Parse an RDF file from the resources folder
 *
 * @param sc spark context to use for parsing
 * @param fileName name of the file to parse
 * @return RDD of quads from the requested file
 */
public static JavaRDD<Quad> getQuadsRDD(JavaSparkContext sc, String fileName) {
    QuadParser parser = new ElephasQuadParser(
        new QuadParserConfig()
            .setBatchSize(2),
        sc
    );
    String path = TestUtils.getDatasetPath(fileName);
    return parser.parseQuads(path);
}
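A minimal usage sketch of this helper with a local context. The resource name "sample.nq" is a placeholder, the Quad import assumes the Jena class used by Elephas, and getQuadsRDD is assumed to be reachable from this class (e.g. via a static import of its enclosing class).

import org.apache.jena.sparql.core.Quad;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class QuadsRddExample {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[*]", "quad-parser-example");

        // Parse a small quad file from the test resources and count the results.
        JavaRDD<Quad> quads = getQuadsRDD(sc, "sample.nq");
        System.out.println("Parsed quads: " + quads.count());

        sc.close();
    }
}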