/**
 * Returns the number of rows in the given data frame as an {@code int}.
 *
 * @param dataFrame
 *            the data frame to measure, may be {@code null}
 * @return the row count, or 0 when the frame is {@code null} or its count
 *         does not fit into an {@code int}
 */
public int getDataFrameSize(DataFrame dataFrame) {
    if (dataFrame == null) {
        return 0;
    }
    long rowCount = dataFrame.count();
    // Refuse to narrow when the cast would change the value.
    if (rowCount > Integer.MAX_VALUE || rowCount < Integer.MIN_VALUE) {
        logger.error(rowCount + " cannot be cast to int without changing its value.");
        return 0;
    }
    return (int) rowCount;
}
// Query Cassandra through Spark SQL: select the rows of testing.cf_text
// with id 'Open' on two fixed dates, then print how many matched.
CassandraSQLContext sqlContext = new CassandraSQLContext(sc);
String query = "SELECT * FROM testing.cf_text where id='Open' and date IN ('2015-01-21','2015-01-22')";
DataFrame resultsFrame = sqlContext.sql(query);
// count() triggers a full Spark job over the result set.
System.out.println(resultsFrame.count());
// Parse a CSV of dates from HDFS into TestClass records, register the
// result as the temp table "dates", and count it via Spark SQL.
// NOTE(review): Date.parse(String) is deprecated and locale-dependent;
// also assumes the date lives in column index 2 — confirm the CSV layout.
JavaRDD<TestClass> dates = sc.textFile("hdfs://0.0.0.0:19000/Dates.csv").map(
        new Function<String, TestClass>(){
            @Override
            public TestClass call(String line){
                String[] fields = line.split(",");
                TestClass tc = new TestClass();
                tc.setDate(Date.parse(fields[2]));
                return tc;
            }
        });
DataFrame schemaTransactions = sqlContext.createDataFrame(dates, TestClass.class);
schemaTransactions.registerTempTable("dates");
DataFrame dAs = sqlContext.sql("SELECT * FROM dates");
// count() forces evaluation of the lazy query above.
dAs.count();
// Report how many sentences were processed and how long tagging took.
long n = input.count();
System.out.println(" Number of sentences = " + n);
// NOTE(review): 'duration' is computed elsewhere; presumed to already be
// in milliseconds per the message text — confirm at the call site.
System.out.println(" Total tagging time = " + duration + " milliseconds.");
void testRandomSplit(String inputFileName, int numFeatures, String modelFileName) { CMMParams params = new CMMParams() .setMaxIter(600) .setRegParam(1E-6) .setMarkovOrder(2) .setNumFeatures(numFeatures); JavaRDD<String> lines = jsc.textFile(inputFileName); DataFrame dataset = createDataFrame(lines.collect()); DataFrame[] splits = dataset.randomSplit(new double[]{0.9, 0.1}); DataFrame trainingData = splits[0]; System.out.println("Number of training sequences = " + trainingData.count()); DataFrame testData = splits[1]; System.out.println("Number of test sequences = " + testData.count()); // train and save a model on the training data cmmModel = train(trainingData, modelFileName, params); // test the model on the test data System.out.println("Test accuracy:"); evaluate(testData); // test the model on the training data System.out.println("Training accuracy:"); evaluate(trainingData); }
public static void main( String[] args ) { // SparkConf conf = new SparkConf().setAppName("App-mt").setMaster("local[2]"); // SparkConf conf = new SparkConf().setAppName("App-mt").setMaster("spark://Kavithas-MBP.home:7077"); SparkConf conf = new SparkConf().setAppName("App-mt").setMaster("spark://kavithas-mbp.watson.ibm.com:7077"); JavaSparkContext sc = new JavaSparkContext(conf); HiveContext sqlContext = new HiveContext(sc.sc()); DataFrame urls = sqlContext.read().json("/tmp/urls.json"); urls.registerTempTable("urls"); DataFrame temp = sqlContext.sql("select * from urls"); temp.show(); sqlContext.sql("add jar /tmp/quetzal.jar"); sqlContext.sql("create temporary function webservice as 'com.ibm.research.rdf.store.utilities.WebServiceGetUDTF'"); DataFrame drugs = sqlContext.sql("select webservice(\"drug,id,action\", \"url\", \"\", \"GET\", \"xs=http://www.w3.org/2001/XMLSchema\", \"//row\",\"drug\",\"./drug\"," + " \"<string>\", \"id\", \"./id\",\"<string>\", \"action\", \"./action\", \"<string>\", url) as (drug, drug_typ, id, id_typ, action, action_typ) from urls"); drugs.show(); System.out.println("Num rows:" + drugs.count()); }
// Score df0 with the trained model and broadcast the prediction column
// so worker tasks can read it.
DataFrame df1 = model.transform(df0);
prediction = jsc.broadcast(df1.select("prediction").collect());
// Only map the input when the model produced at least one prediction.
// NOTE(review): this fragment is truncated — the if-block's closing brace
// lies outside this view.
if (df1.count() > 0) {
    output = s.map(new WhitespaceClassificationFunction());
// Evaluate multiclass predictions using the "precision" metric.
// NOTE(review): "precision" is only a valid metric name for the Spark 1.x
// MulticlassClassificationEvaluator; Spark 2.x renamed it "accuracy" —
// confirm the Spark version in use.
MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator().setMetricName("precision");
if (verbose) {
    // Dump dataset dimensions: N examples, D vocabulary size, K labels.
    // NOTE(review): fragment is truncated — the if-block's closing brace
    // lies outside this view.
    System.out.println("N = " + trainingData.count());
    System.out.println("D = " + vocabSize);
    System.out.println("K = " + numLabels);
// Build a DataFrame of whitespace contexts and summarise it.
DataFrame contexts = sqlContext.createDataFrame(jrdd, WhitespaceContext.class);
// show(false) prints cell values without truncation.
contexts.show(false);
long total = contexts.count();
System.out.println("N = " + total);
// Per-label frequency table.
contexts.groupBy("label").count().show();