29 | 29 | sc.parallelize([(0L, "a b c d e spark", 1.0), |
30 | 30 | (1L, "b d", 0.0), |
31 | 31 | (2L, "spark f g h", 1.0), |
32 | | - (3L, "hadoop mapreduce", 0.0)]) \ |
| 32 | + (3L, "hadoop mapreduce", 0.0)]) |
33 | 33 | .map(lambda x: Row(id=x[0], text=x[1], label=x[2]))) |
34 | 34 |
35 | 35 | tokenizer = Tokenizer() \ |
36 | | - .setInputCol("text") \ |
37 | | - .setOutputCol("words") |
| 36 | + .setInputCol("text") \ |
| 37 | + .setOutputCol("words") |
38 | 38 | hashingTF = HashingTF() \ |
39 | | - .setInputCol(tokenizer.getOutputCol()) \ |
40 | | - .setOutputCol("features") |
| 39 | + .setInputCol(tokenizer.getOutputCol()) \ |
| 40 | + .setOutputCol("features") |
41 | 41 | lr = LogisticRegression() \ |
42 | | - .setMaxIter(10) \ |
43 | | - .setRegParam(0.01) |
| 42 | + .setMaxIter(10) \ |
| 43 | + .setRegParam(0.01) |
44 | 44 | pipeline = Pipeline() \ |
45 | | - .setStages([tokenizer, hashingTF, lr]) |
| 45 | + .setStages([tokenizer, hashingTF, lr]) |
46 | 46 |
47 | 47 | model = pipeline.fit(training) |
48 | 48 |
49 | 49 | test = sqlCtx.inferSchema( |
50 | 50 | sc.parallelize([(4L, "spark i j k"), |
51 | 51 | (5L, "l m n"), |
52 | 52 | (6L, "mapreduce spark"), |
53 | | - (7L, "apache hadoop")]) \ |
| 53 | + (7L, "apache hadoop")]) |
54 | 54 | .map(lambda x: Row(id=x[0], text=x[1]))) |
55 | 55 |
56 | 56 | for row in model.transform(test).collect(): |
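
For reference, here is how the Python example reads once this change is applied. The hunk covers only lines 29-56 of the file, so everything outside that range below (the imports, the SparkContext/SQLContext setup with its app name, the training assignment, and the print body of the final loop) is a reconstruction from the visible calls, not part of the diff; it assumes the Spark 1.x, Python 2 API the hunk itself uses (sqlCtx.inferSchema, 0L long literals):

    from pyspark import SparkContext
    from pyspark.ml import Pipeline
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.feature import HashingTF, Tokenizer
    from pyspark.sql import Row, SQLContext

    sc = SparkContext(appName="SimpleTextClassificationPipeline")  # assumed setup
    sqlCtx = SQLContext(sc)  # assumed setup

    # Labeled training documents: (id, text, label).
    training = sqlCtx.inferSchema(
        sc.parallelize([(0L, "a b c d e spark", 1.0),
                        (1L, "b d", 0.0),
                        (2L, "spark f g h", 1.0),
                        (3L, "hadoop mapreduce", 0.0)])
          .map(lambda x: Row(id=x[0], text=x[1], label=x[2])))

    # Three-stage pipeline: tokenize the text, hash the tokens into
    # feature vectors, then fit a logistic regression on those vectors.
    tokenizer = Tokenizer() \
        .setInputCol("text") \
        .setOutputCol("words")
    hashingTF = HashingTF() \
        .setInputCol(tokenizer.getOutputCol()) \
        .setOutputCol("features")
    lr = LogisticRegression() \
        .setMaxIter(10) \
        .setRegParam(0.01)
    pipeline = Pipeline() \
        .setStages([tokenizer, hashingTF, lr])

    model = pipeline.fit(training)

    # Unlabeled test documents: (id, text).
    test = sqlCtx.inferSchema(
        sc.parallelize([(4L, "spark i j k"),
                        (5L, "l m n"),
                        (6L, "mapreduce spark"),
                        (7L, "apache hadoop")])
          .map(lambda x: Row(id=x[0], text=x[1])))

    # Each output Row carries the input columns plus the fitted model's
    # prediction column. The loop body is assumed; the hunk ends at the
    # for line.
    for row in model.transform(test).collect():
        print row

The trailing backslashes dropped after "...])" on lines 32 and 53 were redundant: those expressions continue inside the still-open inferSchema( parentheses, where Python joins lines implicitly. The setter chains (Tokenizer(), HashingTF(), and so on) sit outside any bracket, so they keep their explicit line-continuation backslashes.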