author    jayunit100 <jayunit100@gmail.com>  2014-04-21 08:56:57 -0400
committer jayunit100 <jay@apache.org>        2015-01-15 21:35:08 -0500
commit    3b13a811a8c6ad568a6a3a7b586be9f2fdf4e810
tree      dd2660e7582067a5be845c1d63ea58032c1094d6
parent    b4a7a7a47c2ab582af93c5d267b2970f4555f934
BIGTOP-1287. Mahout smokes: Remove dirichlet/meanshift clustering.
-rw-r--r--  bigtop-tests/test-artifacts/mahout/src/main/groovy/org/apache/bigtop/itest/mahout/smoke/TestMahoutExamples.groovy  268
1 file changed, 129 insertions, 139 deletions
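
This patch is mostly comment re-indentation plus the removal of the dirichlet and mean shift clustering smokes, presumably because recent Mahout releases no longer ship those drivers. The surviving clustering tests (kmeans, canopy, fuzzykmeans) all delegate to a single helper; below is a minimal Groovy sketch of that shape, where _clusterSyntheticControlData and the timeout value come from the file itself but this particular @Test body is an illustration, not a quote from the patch:

    // Illustrative only: the pattern the remaining clustering smokes share.
    // _clusterSyntheticControlData (defined in the file below) stages
    // synthetic_control.data into HCFS, runs the bundled
    // org.apache.mahout.clustering.syntheticcontrol.<algorithm>.Job driver,
    // and asserts a zero return code.
    @Test(timeout=900000L)
    public void clusterControlDataWithKMeans() {
        _clusterSyntheticControlData("kmeans");
    }
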
diff --git a/bigtop-tests/test-artifacts/mahout/src/main/groovy/org/apache/bigtop/itest/mahout/smoke/TestMahoutExamples.groovy b/bigtop-tests/test-artifacts/mahout/src/main/groovy/org/apache/bigtop/itest/mahout/smoke/TestMahoutExamples.groovy
index 9e50350b..63f07be2 100644
--- a/bigtop-tests/test-artifacts/mahout/src/main/groovy/org/apache/bigtop/itest/mahout/smoke/TestMahoutExamples.groovy
+++ b/bigtop-tests/test-artifacts/mahout/src/main/groovy/org/apache/bigtop/itest/mahout/smoke/TestMahoutExamples.groovy
@@ -1,20 +1,20 @@
/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+* <p>
+* http://www.apache.org/licenses/LICENSE-2.0
+* <p>
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
package org.apache.bigtop.itest.mahout.smoke;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertEquals;
@@ -28,17 +28,17 @@ import org.apache.bigtop.itest.JarContent;
import org.apache.bigtop.itest.shell.Shell;
/**
- * Test Mahout examples shipped with the distribution.
- */
+* Test Mahout examples shipped with the distribution.
+*/
public class TestMahoutExamples {
public static final String TEMP_DIR = "/tmp/mahout.${(new Date().getTime())}";
public static final String WORK_DIR = TEMP_DIR;
/**
- * If MAHOUT_HOME is supplied, use that as the executable. Else, use
- * mahout. This eases the testing of tarball installations and other scenarios
- * where possible more than one version of an ecosystem component is available.
- */
+ * If MAHOUT_HOME is supplied, use that as the executable. Else, use
+ * mahout. This eases the testing of tarball installations and other scenarios
+ * where possible more than one version of an ecosystem component is available.
+ */
public static String MAHOUT_HOME = System.getenv("MAHOUT_HOME") ;
public static String MAHOUT = MAHOUT_HOME ? MAHOUT_HOME+"/bin/mahout":"mahout"
@@ -48,68 +48,68 @@ public class TestMahoutExamples {
/**
* Mahout smokes rely on a lot of external files. So we
- * modularize the downloads into a single function, so that
- * the setup is easier to debug. If any download results in a
+ * modularize the downloads into a single function, so that
+ * the setup is easier to debug. If any download results in a
* small file (i.e. due to 404 or 500 error), assertion will fail
- * before the smokes actually start.
+ * before the smokes actually start.
*/
public static void download(){
- //key value pairs : data file -> url that file resides on.
- def urlmap = [
- "20news-bydate.tar.gz":
- "http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz" ,
-
- "reuters21578.tar.gz":
- "http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz",
-
- "synthetic_control.data":
- "http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data",
-
- "ml-1m.zip":
- "http://files.grouplens.org/papers/ml-1m.zip"
- ];
- //For each url above, download it.
+        //key value pairs : data file -> url that file resides on.
+ def urlmap = [
+ "20news-bydate.tar.gz":
+ "http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz" ,
+
+ "reuters21578.tar.gz":
+ "http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz",
+
+ "synthetic_control.data":
+ "http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data",
+
+ "ml-1m.zip":
+ "http://files.grouplens.org/papers/ml-1m.zip"
+ ];
+ //For each url above, download it.
urlmap.each() {
- f_name,loc ->
- sh.exec("if [ ! -f ${download_dir}/${f_name} ]; then " +
- "curl ${loc} -o ${download_dir}/${f_name}; " +
- "fi");
- File file = new File("${download_dir}/${f_name}");
-
- assertTrue("file "+ f_name + " at "+loc + " len=" + file.length() + " is > 5k bytes", file.length() > 5000 );
+            f_name,loc ->
+ sh.exec("if [ ! -f ${download_dir}/${f_name} ]; then " +
+ "curl ${loc} -o ${download_dir}/${f_name}; " +
+ "fi");
+ File file = new File("${download_dir}/${f_name}");
+
+            assertTrue("file "+ f_name + " at "+loc + " len=" + file.length() + " is > 5k bytes", file.length() > 5000 );
}
}
- /**
- * Individual tests (i.e. movie lens factorizer) will selectively copy this directory into the
- * distributed file system & then run tests against it (i.e. movie lens factorizer uses "fs -put" after
- * formatting a csv file in the tmp dir).
- */
+ /**
+ * Individual tests (i.e. movie lens factorizer) will selectively copy this directory into the
+ * distributed file system & then run tests against it (i.e. movie lens factorizer uses "fs -put" after
+ * formatting a csv file in the tmp dir).
+ */
@BeforeClass
public static void setUp() {
- download();
+ download();
// uncompress archives
sh.exec("mkdir ${TEMP_DIR}",
- "cd ${TEMP_DIR}",
+ "cd ${TEMP_DIR}",
//Create news-date data dir :: input for classifier test
- "mkdir 20news-bydate",
- "cd 20news-bydate",
- "tar xzf ${download_dir}/20news-bydate.tar.gz",
- "cd ..",
- //Create news-all data directory :: input for LDA test
- "mkdir 20news-all",
- "cp -R 20news-bydate/*/* 20news-all",
- "mkdir reuters-sgm",
- "cd reuters-sgm",
- "tar xzf ${download_dir}/reuters21578.tar.gz",
- "cd ..",
- //Create movie lens data directory :: input data for movie recommender test
- "mkdir movielens",
- "cd movielens",
- "unzip ${download_dir}/ml-1m.zip");
+ "mkdir 20news-bydate",
+ "cd 20news-bydate",
+ "tar xzf ${download_dir}/20news-bydate.tar.gz",
+ "cd ..",
+ //Create news-all data directory :: input for LDA test
+ "mkdir 20news-all",
+ "cp -R 20news-bydate/*/* 20news-all",
+ "mkdir reuters-sgm",
+ "cd reuters-sgm",
+ "tar xzf ${download_dir}/reuters21578.tar.gz",
+ "cd ..",
+ //Create movie lens data directory :: input data for movie recommender test
+ "mkdir movielens",
+ "cd movielens",
+ "unzip ${download_dir}/ml-1m.zip");
assertEquals("Failed to uncompress archives", 0, sh.getRet());
sh.exec("hadoop fs -mkdir ${WORK_DIR}");
assertEquals("Unable to create work dir in HCFS", 0, sh.getRet());
@@ -117,22 +117,22 @@ public class TestMahoutExamples {
}
/**
- * Run method that tests for 0 return code and logs the entire command.
- */
+ * Run method that tests for 0 return code and logs the entire command.
+ */
public void assertRun(String mahoutJob){
final String cmd = MAHOUT+" "+mahoutJob;
- //Cat the commands to a central file thats easy to tail.
- //TODO a simpler
- sh.exec("echo \""+cmd+"\" >> /var/log/mahout.smoke");
- sh.exec(cmd);
+ //Cat the commands to a central file thats easy to tail.
+ //TODO a simpler
+        sh.exec("echo \""+cmd+"\" >> /var/log/mahout.smoke");
+ sh.exec(cmd);
assertEquals("non-zero return! :::: "+cmd + " :::: out= " + sh.out + " :::: err= "+sh.err, 0, sh.getRet());
}
@AfterClass
public static void tearDown() {
- sh.exec("rm -rf ${TEMP_DIR}",
- "hadoop fs -rmr ${WORK_DIR}");
+ sh.exec("rm -rf ${TEMP_DIR}",
+ "hadoop fs -rmr ${WORK_DIR}");
}
private static void rmr(String path) {
@@ -148,54 +148,54 @@ public class TestMahoutExamples {
sh.exec("mapred job -list | grep 'Total jobs:0'");
if (sh.getRet() == 0) {
sh.exec("for jobid in `mapred job -list | grep 'RUNNING' |awk '{print \$1}'`;",
- "do mapred job -kill \${jobid};",
- "done");
+ "do mapred job -kill \${jobid};",
+ "done");
}
}
- //iterations for factorizer, original value was "10",
- //on a small 4 node cluster, 2 iterations
- //should complete in about 5 minutes or so.
- static final int ITERATIONS=2;
-
- /**
- * This is the full workflow for creating recommendations based on movie
- * ratings including creating training/test data, ALS for training, evaluating
- * the ALS, and then outputting final movie recommendations for users.
- */
+ //iterations for factorizer, original value was "10",
+ //on a small 4 node cluster, 2 iterations
+ //should complete in about 5 minutes or so.
+ static final int ITERATIONS=2;
+
+ /**
+ * This is the full workflow for creating recommendations based on movie
+ * ratings including creating training/test data, ALS for training, evaluating
+ * the ALS, and then outputting final movie recommendations for users.
+ */
@Test(timeout=12000000L)
public void factorizeMovieLensRatings() {
// convert ratings
- sh.exec("cat ${TEMP_DIR}/movielens/ml-1m/ratings.dat |sed -e s/::/,/g| cut -d, -f1,2,3 > ${TEMP_DIR}/movielens/ratings.csv");
+        sh.exec("cat ${TEMP_DIR}/movielens/ml-1m/ratings.dat |sed -e s/::/,/g| cut -d, -f1,2,3 > ${TEMP_DIR}/movielens/ratings.csv");
assertEquals("Unexpected error from converting ratings", 0, sh.getRet());
// put ratings in hdfs
sh.exec("hadoop fs -mkdir ${WORK_DIR}/movielens",
- "hadoop fs -put ${TEMP_DIR}/movielens/ratings.csv ${WORK_DIR}/movielens/ratings.csv");
+ "hadoop fs -put ${TEMP_DIR}/movielens/ratings.csv ${WORK_DIR}/movielens/ratings.csv");
assertEquals("Unable to put movielens/ratings.csv in hdfs", 0, sh.getRet());
//create a 90% percent training set and a 10% probe set
assertRun("splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset " +
- "--trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp");
+ "--trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp");
-        //Default iterations was 10, but for simple smokes that most might run,
-        //2 iterations will confirm enough to move on.
-        //run distributed ALS-WR to factorize the rating matrix based on the training set
-
+        //Default iterations was 10, but for simple smokes that most might run,
+        //2 iterations will confirm enough to move on.
+
+        //run distributed ALS-WR to factorize the rating matrix based on the training set
assertRun("parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out " +
- "--tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations ${ITERATIONS} --lambda 0.065");
+ "--tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations ${ITERATIONS} --lambda 0.065");
- //remove this
- sh.exec("hadoop fs -ls ${WORK_DIR}/als/out >> /tmp/mahoutdebug");
+ //remove this
+        sh.exec("hadoop fs -ls ${WORK_DIR}/als/out >> /tmp/mahoutdebug");
//compute predictions against the probe set, measure the error
assertRun("evaluateFactorization --output ${WORK_DIR}/als/rmse --input ${WORK_DIR}/dataset/probeSet/ " +
- "--userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp");
-
+ "--userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp");
+
//compute recommendations
assertRun("recommendfactorized --input ${WORK_DIR}/als/out/userRatings/ --output ${WORK_DIR}/recommendations " +
- "--userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ " +
- "--numRecommendations 6 --maxRating 5");
+ "--userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ " +
+ "--numRecommendations 6 --maxRating 5");
// check that error has been calculated
assertEquals("${WORK_DIR}/als/rmse/rmse.txt does not exist", 0, sh.getRet());
@@ -208,22 +208,22 @@ public class TestMahoutExamples {
assertEquals("${WORK_DIR}/recommendations/part-m-00000 does not exist", 0, sh.getRet());
}
- /**
- * Alternative to parameterized test: this is a test that is implemented by each
- * individual clustering test.
- *
- * Explanation of clustering tests:
- *
- * Each of the below tests runs a different clustering algorithm against the same
- * input data set, against synthesize "control" data. "Control data" is data that shows
- * the time series performance of a process. For example, a cellphone company
- * might want to run this to find which regions have decreasing performance over time (i.e. due to increased population),
- * versus places which have cyclic performance (i.e. due to weather).
- */
+ /**
+ * Alternative to parameterized test: this is a test that is implemented by each
+ * individual clustering test.
+ *
+ * Explanation of clustering tests:
+ *
+ * Each of the below tests runs a different clustering algorithm against the same
+ * input data set, against synthesize "control" data. "Control data" is data that shows
+ * the time series performance of a process. For example, a cellphone company
+ * might want to run this to find which regions have decreasing performance over time (i.e. due to increased population),
+ * versus places which have cyclic performance (i.e. due to weather).
+ */
private void _clusterSyntheticControlData(String algorithm) {
rmr("testdata");
sh.exec("hadoop fs -mkdir testdata",
- "hadoop fs -put ${download_dir}/synthetic_control.data testdata");
+ "hadoop fs -put ${download_dir}/synthetic_control.data testdata");
assertEquals("Unable to put data in hdfs", 0, sh.getRet());
assertRun("org.apache.mahout.clustering.syntheticcontrol.${algorithm}.Job");
assertEquals("Unexpected error from running mahout", 0, sh.getRet());
@@ -244,19 +244,9 @@ public class TestMahoutExamples {
_clusterSyntheticControlData("fuzzykmeans");
}
- @Test(timeout=900000L)
- public void clusterControlDataWithDirichlet() {
- _clusterSyntheticControlData("dirichlet");
- }
-
- @Test(timeout=900000L)
- public void clusterControlDataWithMeanShift() {
- _clusterSyntheticControlData("meanshift");
- }
-
- /**
- * Test the creation of topical clusters from raw lists words using LDA.
- */
+ /**
+ * Test the creation of topical clusters from raw lists words using LDA.
+ */
@Test(timeout=7200000L)
public void testReutersLDA() {
// where does lda.algorithm come in?
@@ -268,12 +258,12 @@ public class TestMahoutExamples {
assertRun("seqdirectory -i ${TEMP_DIR}/reuters-out -o ${TEMP_DIR}/reuters-out-seqdir -c UTF-8 -chunk 5");
assertEquals("Unexpected error from running mahout", 0, sh.getRet());
/*
- // reuters-out-seqdir exists on a local disk at this point,
- // copy it to hdfs
- rmr("${WORK_DIR}/reuters-out-seqdir");
- sh.exec("hadoop fs -put ${TEMP_DIR}/reuters-out-seqdir ${WORK_DIR}/reuters-out-seqdir");
- assertEquals("Unable to put reuters-out-seqdir in hdfs", 0, sh.getRet());
- */
+ // reuters-out-seqdir exists on a local disk at this point,
+ // copy it to hdfs
+ rmr("${WORK_DIR}/reuters-out-seqdir");
+ sh.exec("hadoop fs -put ${TEMP_DIR}/reuters-out-seqdir ${WORK_DIR}/reuters-out-seqdir");
+ assertEquals("Unable to put reuters-out-seqdir in hdfs", 0, sh.getRet());
+ */
assertRun("""seq2sparse \
-i ${WORK_DIR}/reuters-out-seqdir/ \
-o ${WORK_DIR}/reuters-out-seqdir-sparse-lda \
@@ -292,10 +282,10 @@ mahout ldatopics \
-dt sequencefile""");
}
- /**
- * Note that this test doesnt work on some older mahout versions.
- */
- @Test(timeout=9000000L)
+ /**
+ * Note that this test doesnt work on some older mahout versions.
+ */
+ @Test(timeout=9000000L)
public void testBayesNewsgroupClassifier() {
// put bayes-train-input and bayes-test-input in hdfs
sh.exec("hadoop fs -mkdir ${WORK_DIR}/20news-vectors");