void defaultSearchGLM(Key<Grid> gridKey) { Algo algo = Algo.GLM; WorkAllocations.Work work = workAllocations.getAllocation(algo, JobType.HyperparamSearch); if (work == null) return; GLMParameters glmParameters = new GLMParameters(); setCommonModelBuilderParams(glmParameters); glmParameters._lambda_search = true; glmParameters._family = getResponseColumn().isBinary() && !(getResponseColumn().isNumeric()) ? GLMParameters.Family.binomial : getResponseColumn().isCategorical() ? GLMParameters.Family.multinomial : GLMParameters.Family.gaussian; // TODO: other continuous distributions! Map<String, Object[]> searchParams = new HashMap<>(); glmParameters._alpha = new double[] {0.0, 0.2, 0.4, 0.6, 0.8, 1.0}; // Note: standard GLM parameter is an array; don't use searchParams! // NOTE: removed MissingValuesHandling.Skip for now because it's crashing. See https://0xdata.atlassian.net/browse/PUBDEV-4974 searchParams.put("_missing_values_handling", new DeepLearningParameters.MissingValuesHandling[] {DeepLearningParameters.MissingValuesHandling.MeanImputation /* , DeepLearningModel.DeepLearningParameters.MissingValuesHandling.Skip */}); Job<Grid> glmJob = hyperparameterSearch(gridKey, work, glmParameters, searchParams); pollAndUpdateProgress(Stage.ModelTraining, "GLM hyperparameter search", work, this.job(), glmJob); }
@Test public void testAllNAs() { Key raw = Key.make("gamma_test_data_raw"); Key parsed = Key.make("gamma_test_data_parsed"); FVecTest.makeByteVec(raw, "x,y,z\n1,0,NA\n2,NA,1\nNA,3,2\n4,3,NA\n5,NA,1\nNA,6,4\n7,NA,9\n8,NA,18\nNA,9,23\n10,31,NA\nNA,11,20\n12,NA,25\nNA,13,37\n14,45,NA\n"); Frame fr = ParseDataset.parse(parsed, raw); GLM job = null; try { GLMParameters params = new GLMParameters(Family.poisson); // params._response = 1; params._response_column = fr._names[1]; params._train = parsed; params._lambda = new double[]{0}; params._missing_values_handling = MissingValuesHandling.Skip; GLM glm = new GLM( params); glm.trainModel().get(); assertFalse("should've thrown IAE", true); } catch (IllegalArgumentException e) { assertTrue(e.getMessage(), e.getMessage().contains("No rows left in the dataset")); } finally { fr.delete(); } }
@Test public void testSingleCatNoIcpt(){ Vec cat = Vec.makeVec(new long[]{1,1,1,0,0},new String[]{"black","red"},Vec.newKey()); Vec res = Vec.makeVec(new double[]{1,1,0,0,0},cat.group().addVec()); Frame fr = new Frame(Key.<Frame>make("fr"),new String[]{"x","y"},new Vec[]{cat,res}); DKV.put(fr); GLMParameters parms = new GLMParameters(); parms._train = fr._key; parms._alpha = new double[]{0}; parms._response_column = "y"; parms._intercept = false; parms._family = Family.binomial; // just make sure it runs GLMModel model = new GLM(parms).trainModel().get(); Map<String,Double> coefs = model.coefficients(); System.out.println("coefs = " + coefs); Assert.assertEquals(coefs.get("Intercept"),0,0); Assert.assertEquals(4.2744474,((GLMMetrics)model._output._training_metrics).residual_deviance(),1e-4); System.out.println(); model.delete(); fr.delete(); }
/**
 * (Ignored) Trains a gaussian GLM with lambda search on the airlines data using a weights
 * column that is 1 for rows with global index below 1999 and 0 for all remaining rows.
 * The test only checks that training completes.
 */
@Test
@Ignore
public void testConstantColumns() {
  GLMModel model1 = null;
  Frame fr = parse_test_file(Key.make("Airlines"), "smalldata/airlines/allyears2k_headers.zip");
  try {
    Vec y = fr.vec("IsDepDelayed").makeCopy(null);
    fr.replace(fr.find("IsDepDelayed"), y).remove();
    Vec weights = fr.anyVec().makeZero();
    new MRTask() {
      @Override
      public void map(Chunk c) {
        for (int i = 0; i < c._len; ++i) {
          long rid = c.start() + i; // global row index
          if (rid >= 1999) break;
          c.set(i, 1);
        }
      }
    }.doAll(weights);
    fr.add("weights", weights);
    DKV.put(fr);
    GLMParameters parms = new GLMParameters(Family.gaussian);
    parms._train = fr._key;
    parms._weights_column = "weights";
    parms._lambda_search = true;
    parms._alpha = new double[]{0};
    parms._response_column = "IsDepDelayed";
    parms._ignored_columns = new String[]{"DepTime", "ArrTime", "Cancelled", "CancellationCode", "DepDelay", "Diverted", "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed"};
    parms._standardize = true;
    model1 = new GLM(parms).trainModel().get();
  } finally {
    // clean up even when training fails
    if (model1 != null) model1.delete();
    fr.delete();
  }
}
@Test public void testGaussianRegression() throws InterruptedException, ExecutionException { Key raw = Key.make("gaussian_test_data_raw"); Key parsed = Key.make("gaussian_test_data_parsed"); GLMModel model = null; Frame fr = null, res = null; try { // make data so that the expected coefficients is icept = col[0] = 1.0 FVecTest.makeByteVec(raw, "x,y\n0,0\n1,0.1\n2,0.2\n3,0.3\n4,0.4\n5,0.5\n6,0.6\n7,0.7\n8,0.8\n9,0.9"); fr = ParseDataset.parse(parsed, raw); GLMParameters params = new GLMParameters(Family.gaussian); params._train = fr._key; // params._response = 1; params._response_column = fr._names[1]; params._lambda = new double[]{0}; // params._standardize= false; model = new GLM(params).trainModel().get(); HashMap<String, Double> coefs = model.coefficients(); assertEquals(0.0, coefs.get("Intercept"), 1e-4); assertEquals(0.1, coefs.get("x"), 1e-4); testScoring(model,fr); } finally { if (fr != null) fr.remove(); if (res != null) res.remove(); if (model != null) model.remove(); } }
@Test public void testCoordinateDescent_airlines_CovUpdates() { GLMModel model = null; Key parsed = Key.make("airlines_parsed"); Key<GLMModel> modelKey = Key.make("airlines_model"); Frame fr = parse_test_file(parsed, "smalldata/airlines/AirlinesTrain.csv.zip"); try { // H2O differs on intercept and race, same residual deviance though GLMParameters params = new GLMParameters(); params._standardize = true; params._family = Family.binomial; params._solver = Solver.COORDINATE_DESCENT; params._response_column = "IsDepDelayed"; params._ignored_columns = new String[]{"IsDepDelayed_REC"}; params._train = fr._key; GLM glm = new GLM( params, modelKey); model = glm.trainModel().get(); assertTrue(glm.isStopped()); System.out.println(model._output._training_metrics); } finally { fr.delete(); if (model != null) model.delete(); } }
@Test public void testCoordinateDescent_airlines() { GLMModel model = null; Key parsed = Key.make("airlines_parsed"); Key<GLMModel> modelKey = Key.make("airlines_model"); Frame fr = parse_test_file(parsed, "smalldata/airlines/AirlinesTrain.csv.zip"); try { // H2O differs on intercept and race, same residual deviance though GLMParameters params = new GLMParameters(); params._standardize = true; params._family = Family.binomial; params._solver = Solver.COORDINATE_DESCENT_NAIVE; params._response_column = "IsDepDelayed"; params._ignored_columns = new String[]{"IsDepDelayed_REC"}; params._train = fr._key; GLM glm = new GLM( params, modelKey); model = glm.trainModel().get(); assertTrue(glm.isStopped()); System.out.println(model._output._training_metrics); } finally { fr.delete(); if (model != null) model.delete(); } }
@Test public void testCornerCases() { // new GLM2("GLM testing constant offset on a toy dataset.", Key.make(), modelKey, new GLM2.Source(fr, fr.vec("D"), false, false, fr.vec("E")), Family.gaussian).setRegularization(new double[]{0}, new double[]{0}).doInit().fork().get(); // just test it does not blow up and the model is sane // model = DKV.get(modelKey).get(); // assertEquals(model.coefficients().get("E"), 1, 0); // should be exactly 1 GLMParameters parms = new GLMParameters(Family.gaussian); parms._response_column = "D"; parms._offset_column = "E"; parms._train = _abcd._key; parms._intercept = false; parms._standardize = false; GLMModel m = null; for(Solver s:new Solver[]{Solver.IRLSM,Solver.COORDINATE_DESCENT}) { parms._solver = s; try { m = new GLM(parms).trainModel().get(); GLMTest.testScoring(m, _abcd); System.out.println(m.coefficients()); } finally { if (m != null) m.delete(); } } }
/**
 * Trains a binomial GLM with lambda search and 3-fold cross-validation on the prostate data,
 * keeping the cross-validation models, then removes those models, the main model and the frame.
 * The test only checks that training completes.
 */
@Test
public void testXval() {
  Frame prostate = parse_test_file("smalldata/glm_test/prostate_cat_replaced.csv");
  GLMModel cvModel = null;
  try {
    GLMParameters parms = new GLMParameters(Family.binomial);
    parms._train = prostate._key;
    parms._response_column = "CAPSULE";
    parms._ignored_columns = new String[]{"ID"};
    parms._nfolds = 3;
    parms._keep_cross_validation_models = true;
    parms._lambda_search = true;
    parms._standardize = false;

    GLM builder = new GLM(parms);
    cvModel = builder.trainModel().get();
  } finally {
    prostate.delete();
    if (cvModel != null) {
      // the fold models were kept explicitly, so they must be removed explicitly
      for (Key foldKey : cvModel._output._cross_validation_models) {
        Keyed.remove(foldKey);
      }
      cvModel.delete();
    }
  }
}
@Test public void testCoordinateDescent_anomaly_CovUpdates() { GLMModel model = null; Key parsed = Key.make("anomaly_parsed"); Key<GLMModel> modelKey = Key.make("anomaly_model"); Frame fr = parse_test_file(parsed, "smalldata/anomaly/ecg_discord_train.csv"); try { // H2O differs on intercept and race, same residual deviance though GLMParameters params = new GLMParameters(); params._standardize = true; params._family = Family.gaussian; params._solver = Solver.COORDINATE_DESCENT; params._response_column = "C1"; params._train = fr._key; GLM glm = new GLM( params, modelKey); model = glm.trainModel().get(); assertTrue(glm.isStopped()); System.out.println(model._output._training_metrics); } finally { fr.delete(); if (model != null) model.delete(); } }
@Test public void testCoordinateDescent_anomaly() { GLMModel model = null; Key parsed = Key.make("anomaly_parsed"); Key<GLMModel> modelKey = Key.make("anomaly_model"); Frame fr = parse_test_file(parsed, "smalldata/anomaly/ecg_discord_train.csv"); try { // H2O differs on intercept and race, same residual deviance though GLMParameters params = new GLMParameters(); params._standardize = true; params._family = Family.gaussian; params._solver = Solver.COORDINATE_DESCENT_NAIVE; params._response_column = "C1"; params._train = fr._key; GLM glm = new GLM( params, modelKey); model = glm.trainModel().get(); assertTrue(glm.isStopped()); System.out.println(model._output._training_metrics); } finally { fr.delete(); if (model != null) model.delete(); } }
/**
 * Reproduction named after PUBDEV-1953: trains a poisson GLM on the small citibike training
 * set with the matching test set as validation frame, then runs the shared scoring check on
 * the validation frame.
 */
@Test
public void testCitibikeReproPUBDEV1953() throws Exception {
  GLMModel trained = null;
  Frame trainFrame = parse_test_file("smalldata/glm_test/citibike_small_train.csv");
  Frame validFrame = parse_test_file("smalldata/glm_test/citibike_small_test.csv");
  try {
    Scope.enter();
    GLMParameters parms = new GLMParameters(Family.poisson);
    parms._train = trainFrame._key;
    parms._valid = validFrame._key;
    parms._response_column = "bikes";
    parms._family = Family.poisson;

    GLM builder = new GLM(parms);
    trained = builder.trainModel().get();
    testScoring(trained, validFrame);
  } finally {
    trainFrame.remove();
    validFrame.remove();
    if (trained != null) {
      trained.delete();
    }
    Scope.exit();
  }
}
@Test public void testZeroedColumn(){ Vec x = Vec.makeCon(Vec.newKey(),1,2,3,4,5); Vec y = Vec.makeCon(x.group().addVec(),0,1,0,1,0); Vec z = Vec.makeCon(Vec.newKey(),1,2,3,4,5); Vec w = Vec.makeCon(x.group().addVec(),1,0,1,0,1); Frame fr = new Frame(Key.<Frame>make("test"),new String[]{"x","y","z","w"},new Vec[]{x,y,z,w}); DKV.put(fr); GLMParameters parms = new GLMParameters(Family.gaussian); parms._train = fr._key; parms._lambda = new double[]{0}; parms._alpha = new double[]{0}; parms._compute_p_values = true; parms._response_column = "z"; parms._weights_column = "w"; GLMModel m = new GLM(parms).trainModel().get(); System.out.println(m.coefficients()); m.delete(); fr.delete(); } @Test
@Test //PUBDEV-1839 public void testCitibikeReproPUBDEV1839() throws Exception { GLMModel model = null; Frame tfr = parse_test_file("smalldata/jira/pubdev_1839_repro_train.csv"); Frame vfr = parse_test_file("smalldata/jira/pubdev_1839_repro_test.csv"); try { Scope.enter(); GLMParameters params = new GLMParameters(Family.poisson); params._response_column = "bikes"; params._train = tfr._key; params._valid = vfr._key; GLM glm = new GLM(params); model = glm.trainModel().get(); testScoring(model,vfr); } finally { tfr.remove(); vfr.remove(); if(model != null)model.delete(); Scope.exit(); } }
/**
 * Trains a gaussian GLM with alpha = 1 and lambda search on the Abalone data, using the
 * column at index 8 as the response, then runs the shared scoring check on the training frame.
 */
@Test
public void testAbalone() {
  Scope.enter();
  GLMModel trained = null;
  try {
    Frame abalone = parse_test_file("smalldata/glm_test/Abalone.gz");
    Scope.track(abalone); // the frame is released by Scope.exit()

    GLMParameters parms = new GLMParameters(Family.gaussian);
    parms._train = abalone._key;
    parms._response_column = abalone._names[8];
    parms._lambda_search = true;
    parms._alpha = new double[]{1.0};

    GLM builder = new GLM(parms);
    trained = builder.trainModel().get();
    testScoring(trained, abalone);
  } finally {
    if (trained != null) {
      trained.delete();
    }
    Scope.exit();
  }
}
/** Creates a GLM builder backed by a fresh default {@code GLMParameters}; {@code startup_once} is passed unchanged to the superclass constructor. */
public GLM(boolean startup_once){super(new GLMParameters(),startup_once);} public GLM(GLMModel.GLMParameters parms) {
/**
 * Creates a fresh, default-initialized parameters object for the given metalearner algorithm.
 *
 * @param algo metalearner algorithm; {@code glm}, {@code AUTO} and any value not listed
 *             explicitly map to GLM parameters
 * @return new parameters instance of the matching model type
 */
public static Model.Parameters createParameters(MetalearnerAlgorithm algo) {
  final Model.Parameters parms;
  switch (algo) {
    case deeplearning:
      parms = new DeepLearningModel.DeepLearningParameters();
      break;
    case drf:
      parms = new DRFModel.DRFParameters();
      break;
    case gbm:
      parms = new GBMModel.GBMParameters();
      break;
    case glm:
    case AUTO:
    default:
      parms = new GLMModel.GLMParameters();
      break;
  }
  return parms;
}
/**
 * Constructs a GLM builder with the {@code COORDINATE_DESCENT_NAIVE} solver for each listed
 * family paired with its link function; the test passes when no construction throws.
 */
@Test
public void testNaiveCoordinateDescent_families() {
  // families[k] is paired with linkingfuncs[k]
  final Family[] families = {
      Family.binomial, Family.gaussian, Family.gamma, Family.tweedie,
      Family.poisson, Family.ordinal, Family.quasibinomial
  };
  GLMParameters.Link[] linkingfuncs = {
      GLMParameters.Link.logit, GLMParameters.Link.identity, GLMParameters.Link.log, GLMParameters.Link.tweedie,
      GLMParameters.Link.log, GLMParameters.Link.ologit, GLMParameters.Link.logit
  };

  GLMParameters parms = new GLMParameters(Family.binomial);
  parms._solver = Solver.COORDINATE_DESCENT_NAIVE;

  for (int k = 0; k < families.length; k++) {
    parms._family = families[k];
    parms._link = linkingfuncs[k];
    new GLM(parms);
  }
}
/**
 * Parses the given test dataset, trains a GLM on it and returns the trained model.
 * The parsed frame is deleted before returning, whether or not training succeeds.
 *
 * @param dataset        path of the test file to parse
 * @param ignoredColumns columns to exclude from training
 * @param response       name of the response column
 * @param family         GLM family to train with
 * @return the trained model
 */
private GLMModel prepareGLMModel(String dataset, String[] ignoredColumns, String response, GLMModel.GLMParameters.Family family) {
  Frame trainingFrame = parse_test_file(dataset);
  try {
    GLMModel.GLMParameters parms = new GLMModel.GLMParameters();
    parms._family = family;
    parms._response_column = response;
    parms._ignored_columns = ignoredColumns;
    parms._train = trainingFrame._key;

    GLM builder = new GLM(parms);
    return builder.trainModel().get();
  } finally {
    if (trainingFrame != null) {
      trainingFrame.delete();
    }
  }
}
@Test public void testNaiveCoordinateDescent() { expectedException.expect(H2OIllegalArgumentException.class); expectedException.expectMessage("Naive coordinate descent is not supported for multinomial."); GLMParameters params = new GLMParameters(Family.multinomial); params._solver = Solver.COORDINATE_DESCENT_NAIVE; // Should throw exception with information about unsupported message new GLM(params); }