
OneHot Encoding -- Standardization -- Principal Components -- Clustering

hblt-j
Published on 2017/08/29 11:44

1. Import packages


import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.Row
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Column
import org.apache.spark.sql.DataFrameReader
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.Encoder
import org.apache.spark.sql.functions._
import org.apache.spark.sql.DataFrameStatFunctions
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.OneHotEncoder
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.clustering.KMeans

 

2. Load the data


val spark = SparkSession.builder().appName("Spark SQL basic example").config("spark.some.config.option", "some-value").getOrCreate()

    

// For implicit conversions like converting RDDs to DataFrames

import spark.implicits._

    

val data: DataFrame = spark.read.format("csv").option("header", true).load("hdfs://ns1/datafile/wangxiao/Affairs.csv")

data: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

    

data.cache

res0: data.type = [affairs: string, gender: string ... 7 more fields]

   

data.limit(10).show()

+-------+------+---+------------+--------+-------------+---------+----------+------+

|affairs|gender|age|yearsmarried|children|religiousness|education|occupation|rating|

+-------+------+---+------------+--------+-------------+---------+----------+------+

|      0|  male| 37|          10|      no|            3|       18|         7|     4|

|      0|female| 27|           4|      no|            4|       14|         6|     4|

|      0|female| 32|          15|     yes|            1|       12|         1|     4|

|      0|  male| 57|          15|     yes|            5|       18|         6|     5|

|      0|  male| 22|        0.75|      no|            2|       17|         6|     3|

|      0|female| 32|         1.5|      no|            2|       17|         5|     5|

|      0|female| 22|        0.75|      no|            2|       12|         1|     3|

|      0|  male| 57|          15|     yes|            2|       14|         4|     4|

|      0|female| 32|          15|     yes|            4|       16|         1|     2|

|      0|  male| 22|         1.5|      no|            4|       14|         4|     5|

+-------+------+---+------------+--------+-------------+---------+----------+------+

    

// Cast the column types, keeping the Double and String fields separate

val data1 = data.select(
  data("affairs").cast("Double"),
  data("age").cast("Double"),
  data("yearsmarried").cast("Double"),
  data("religiousness").cast("Double"),
  data("education").cast("Double"),
  data("occupation").cast("Double"),
  data("rating").cast("Double"),
  data("gender").cast("String"),
  data("children").cast("String"))

data1: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 7 more fields]

    

data1.printSchema()

root

 |-- affairs: double (nullable = true)

 |-- age: double (nullable = true)

 |-- yearsmarried: double (nullable = true)

 |-- religiousness: double (nullable = true)

 |-- education: double (nullable = true)

 |-- occupation: double (nullable = true)

 |-- rating: double (nullable = true)

 |-- gender: string (nullable = true)

 |-- children: string (nullable = true)

    

    

data1.limit(10).show

+-------+----+------------+-------------+---------+----------+------+------+--------+

|affairs| age|yearsmarried|religiousness|education|occupation|rating|gender|children|

+-------+----+------------+-------------+---------+----------+------+------+--------+

|    0.0|37.0|        10.0|          3.0|     18.0|       7.0|   4.0|  male|      no|

|    0.0|27.0|         4.0|          4.0|     14.0|       6.0|   4.0|female|      no|

|    0.0|32.0|        15.0|          1.0|     12.0|       1.0|   4.0|female|     yes|

|    0.0|57.0|        15.0|          5.0|     18.0|       6.0|   5.0|  male|     yes|

|    0.0|22.0|        0.75|          2.0|     17.0|       6.0|   3.0|  male|      no|

|    0.0|32.0|         1.5|          2.0|     17.0|       5.0|   5.0|female|      no|

|    0.0|22.0|        0.75|          2.0|     12.0|       1.0|   3.0|female|      no|

|    0.0|57.0|        15.0|          2.0|     14.0|       4.0|   4.0|  male|     yes|

|    0.0|32.0|        15.0|          4.0|     16.0|       1.0|   2.0|female|     yes|

|    0.0|22.0|         1.5|          4.0|     14.0|       4.0|   5.0|  male|      no|

+-------+----+------------+-------------+---------+----------+------+------+--------+

    

val dataDF = data1

dataDF: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 7 more fields]

    

dataDF.cache()

res4: dataDF.type = [affairs: double, age: double ... 7 more fields]
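Casting each column one by one works but gets verbose as the schema grows. A compact equivalent sketch (same column names as above; numericCols and data1Alt are names introduced here for illustration):

// Cast every numeric column in one pass; gender and children stay strings.
val numericCols = Seq("affairs", "age", "yearsmarried", "religiousness", "education", "occupation", "rating")
val data1Alt = numericCols.foldLeft(data) { (df, c) => df.withColumn(c, df(c).cast("Double")) }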

 

3. Convert the string columns to numeric indices, then OneHot encode (note setDropLast(false))


// Convert the string column to a numeric index

val indexer = new StringIndexer().setInputCol("gender").setOutputCol("genderIndex").fit(dataDF)

indexer: org.apache.spark.ml.feature.StringIndexerModel = strIdx_27dba613193a

    

val indexed = indexer.transform(dataDF)

indexed: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 8 more fields]

    

// OneHot encoding; note that setDropLast is set to false

val encoder = new OneHotEncoder().setInputCol("genderIndex").setOutputCol("genderVec").setDropLast(false)

encoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_155a53de3aef

    

val encoded = encoder.transform(indexed)

encoded: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 9 more fields]

    

encoded.show()

+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+

|affairs| age|yearsmarried|religiousness|education|occupation|rating|gender|children|genderIndex|    genderVec|

+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+

|    0.0|37.0|        10.0|          3.0|     18.0|       7.0|   4.0|  male|      no|        1.0|(2,[1],[1.0])|

|    0.0|27.0|         4.0|          4.0|     14.0|       6.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|

|    0.0|32.0|        15.0|          1.0|     12.0|       1.0|   4.0|female|     yes|        0.0|(2,[0],[1.0])|

|    0.0|57.0|        15.0|          5.0|     18.0|       6.0|   5.0|  male|     yes|        1.0|(2,[1],[1.0])|

|    0.0|22.0|        0.75|          2.0|     17.0|       6.0|   3.0|  male|      no|        1.0|(2,[1],[1.0])|

|    0.0|32.0|         1.5|          2.0|     17.0|       5.0|   5.0|female|      no|        0.0|(2,[0],[1.0])|

|    0.0|22.0|        0.75|          2.0|     12.0|       1.0|   3.0|female|      no|        0.0|(2,[0],[1.0])|

|    0.0|57.0|        15.0|          2.0|     14.0|       4.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|

|    0.0|32.0|        15.0|          4.0|     16.0|       1.0|   2.0|female|     yes|        0.0|(2,[0],[1.0])|

|    0.0|22.0|         1.5|          4.0|     14.0|       4.0|   5.0|  male|      no|        1.0|(2,[1],[1.0])|

|    0.0|37.0|        15.0|          2.0|     20.0|       7.0|   2.0|  male|     yes|        1.0|(2,[1],[1.0])|

|    0.0|27.0|         4.0|          4.0|     18.0|       6.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|

|    0.0|47.0|        15.0|          5.0|     17.0|       6.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|

|    0.0|22.0|         1.5|          2.0|     17.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|

|    0.0|27.0|         4.0|          4.0|     14.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|

|    0.0|37.0|        15.0|          1.0|     17.0|       5.0|   5.0|female|     yes|        0.0|(2,[0],[1.0])|

|    0.0|37.0|        15.0|          2.0|     18.0|       4.0|   3.0|female|     yes|        0.0|(2,[0],[1.0])|

|    0.0|22.0|        0.75|          3.0|     16.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|

|    0.0|22.0|         1.5|          2.0|     16.0|       5.0|   5.0|female|      no|        0.0|(2,[0],[1.0])|

|    0.0|27.0|        10.0|          2.0|     14.0|       1.0|   5.0|female|     yes|        0.0|(2,[0],[1.0])|

+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+

only showing top 20 rows
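genderVec is shown in sparse-vector notation: (2,[1],[1.0]) is a length-2 vector with value 1.0 at index 1, i.e. [0.0, 1.0]. The Vectors import above can build the same value by hand (a quick illustration, not part of the original flow):

// (2,[1],[1.0]) == length-2 vector, 1.0 at index 1
val v = Vectors.sparse(2, Array(1), Array(1.0))
println(v.toDense) // [0.0,1.0]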

   

val indexer1 = new StringIndexer().setInputCol("children").setOutputCol("childrenIndex").fit(encoded)

indexer1: org.apache.spark.ml.feature.StringIndexerModel = strIdx_55db099c07b7

    

val indexed1 = indexer1.transform(encoded)

indexed1: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 10 more fields]

    

val encoder1 = new OneHotEncoder().setInputCol("childrenIndex").setOutputCol("childrenVec").setDropLast(false)

    

val encoded1 = encoder1.transform(indexed1)

encoded1: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 11 more fields]

    

encoded1.show()

+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+

|affairs| age|yearsmarried|religiousness|education|occupation|rating|gender|children|genderIndex|    genderVec|childrenIndex|  childrenVec|

+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+

|    0.0|37.0|        10.0|          3.0|     18.0|       7.0|   4.0|  male|      no|        1.0|(2,[1],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|27.0|         4.0|          4.0|     14.0|       6.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|32.0|        15.0|          1.0|     12.0|       1.0|   4.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|

|    0.0|57.0|        15.0|          5.0|     18.0|       6.0|   5.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|

|    0.0|22.0|        0.75|          2.0|     17.0|       6.0|   3.0|  male|      no|        1.0|(2,[1],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|32.0|         1.5|          2.0|     17.0|       5.0|   5.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|22.0|        0.75|          2.0|     12.0|       1.0|   3.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|57.0|        15.0|          2.0|     14.0|       4.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|

|    0.0|32.0|        15.0|          4.0|     16.0|       1.0|   2.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|

|    0.0|22.0|         1.5|          4.0|     14.0|       4.0|   5.0|  male|      no|        1.0|(2,[1],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|37.0|        15.0|          2.0|     20.0|       7.0|   2.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|

|    0.0|27.0|         4.0|          4.0|     18.0|       6.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|

|    0.0|47.0|        15.0|          5.0|     17.0|       6.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|

|    0.0|22.0|         1.5|          2.0|     17.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|27.0|         4.0|          4.0|     14.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|37.0|        15.0|          1.0|     17.0|       5.0|   5.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|

|    0.0|37.0|        15.0|          2.0|     18.0|       4.0|   3.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|

|    0.0|22.0|        0.75|          3.0|     16.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|22.0|         1.5|          2.0|     16.0|       5.0|   5.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|27.0|        10.0|          2.0|     14.0|       1.0|   5.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|

+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+

only showing top 20 rows

   

    

val encodeDF: DataFrame = encoded1

encodeDF: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 11 more fields]

    

encodeDF.show()

+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+

|affairs| age|yearsmarried|religiousness|education|occupation|rating|gender|children|genderIndex|    genderVec|childrenIndex|  childrenVec|

+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+

|    0.0|37.0|        10.0|          3.0|     18.0|       7.0|   4.0|  male|      no|        1.0|(2,[1],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|27.0|         4.0|          4.0|     14.0|       6.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|32.0|        15.0|          1.0|     12.0|       1.0|   4.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|

|    0.0|57.0|        15.0|          5.0|     18.0|       6.0|   5.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|

|    0.0|22.0|        0.75|          2.0|     17.0|       6.0|   3.0|  male|      no|        1.0|(2,[1],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|32.0|         1.5|          2.0|     17.0|       5.0|   5.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|22.0|        0.75|          2.0|     12.0|       1.0|   3.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|57.0|        15.0|          2.0|     14.0|       4.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|

|    0.0|32.0|        15.0|          4.0|     16.0|       1.0|   2.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|

|    0.0|22.0|         1.5|          4.0|     14.0|       4.0|   5.0|  male|      no|        1.0|(2,[1],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|37.0|        15.0|          2.0|     20.0|       7.0|   2.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|

|    0.0|27.0|         4.0|          4.0|     18.0|       6.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|

|    0.0|47.0|        15.0|          5.0|     17.0|       6.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|

|    0.0|22.0|         1.5|          2.0|     17.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|27.0|         4.0|          4.0|     14.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|37.0|        15.0|          1.0|     17.0|       5.0|   5.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|

|    0.0|37.0|        15.0|          2.0|     18.0|       4.0|   3.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|

|    0.0|22.0|        0.75|          3.0|     16.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|22.0|         1.5|          2.0|     16.0|       5.0|   5.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|

|    0.0|27.0|        10.0|          2.0|     14.0|       1.0|   5.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|

+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+

only showing top 20 rows

    

    

encodeDF.printSchema()

root

 |-- affairs: double (nullable = true)

 |-- age: double (nullable = true)

 |-- yearsmarried: double (nullable = true)

 |-- religiousness: double (nullable = true)

 |-- education: double (nullable = true)

 |-- occupation: double (nullable = true)

 |-- rating: double (nullable = true)

 |-- gender: string (nullable = true)

 |-- children: string (nullable = true)

 |-- genderIndex: double (nullable = true)

 |-- genderVec: vector (nullable = true)

 |-- childrenIndex: double (nullable = true)

 |-- childrenVec: vector (nullable = true)
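Chaining each StringIndexer and OneHotEncoder by hand works, but the same stages can be bundled into a single Pipeline, which keeps fitting and transforming to one call each. A minimal sketch (same columns as above; pipeline and encodeDFAlt are illustrative names):

import org.apache.spark.ml.Pipeline

// Index and one-hot encode both string columns in one fitted pipeline.
val genderIdx = new StringIndexer().setInputCol("gender").setOutputCol("genderIndex")
val genderEnc = new OneHotEncoder().setInputCol("genderIndex").setOutputCol("genderVec").setDropLast(false)
val childrenIdx = new StringIndexer().setInputCol("children").setOutputCol("childrenIndex")
val childrenEnc = new OneHotEncoder().setInputCol("childrenIndex").setOutputCol("childrenVec").setDropLast(false)

val pipeline = new Pipeline().setStages(Array(genderIdx, genderEnc, childrenIdx, childrenEnc))
val encodeDFAlt = pipeline.fit(dataDF).transform(dataDF)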

 

4. Assemble the fields into a feature vector


// Assemble the fields into a single feature vector

val assembler = new VectorAssembler().setInputCols(Array("affairs", "age", "yearsmarried", "religiousness", "education", "occupation", "rating", "genderVec", "childrenVec")).setOutputCol("features")

assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_df76d5d1e3f4

    

val vecDF: DataFrame = assembler.transform(encodeDF)

vecDF: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 12 more fields]

    

vecDF.select("features").show

+--------------------+

|            features|

+--------------------+

|[0.0,37.0,10.0,3....|

|[0.0,27.0,4.0,4.0...|

|[0.0,32.0,15.0,1....|

|[0.0,57.0,15.0,5....|

|[0.0,22.0,0.75,2....|

|[0.0,32.0,1.5,2.0...|

|[0.0,22.0,0.75,2....|

|[0.0,57.0,15.0,2....|

|[0.0,32.0,15.0,4....|

|[0.0,22.0,1.5,4.0...|

|[0.0,37.0,15.0,2....|

|[0.0,27.0,4.0,4.0...|

|[0.0,47.0,15.0,5....|

|[0.0,22.0,1.5,2.0...|

|[0.0,27.0,4.0,4.0...|

|[0.0,37.0,15.0,1....|

|[0.0,37.0,15.0,2....|

|[0.0,22.0,0.75,3....|

|[0.0,22.0,1.5,2.0...|

|[0.0,27.0,10.0,2....|

+--------------------+

only showing top 20 rows
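The truncated display hides which slot of features holds which input field. VectorAssembler writes that mapping into the column metadata, which can be read back (a sketch; slot names may be absent for inputs that carried no metadata):

import org.apache.spark.ml.attribute.AttributeGroup

// Print the per-slot attribute names recorded by VectorAssembler.
val group = AttributeGroup.fromStructField(vecDF.schema("features"))
group.attributes.foreach(_.foreach(a => println(s"${a.index.getOrElse(-1)}: ${a.name.getOrElse("?")}")))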

 

5. Standardization -- zero mean, unit standard deviation


// Standardize to zero mean and unit standard deviation

val scaler = new StandardScaler().setInputCol("features").setOutputCol("scaledFeatures").setWithStd(true).setWithMean(true)

scaler: org.apache.spark.ml.feature.StandardScaler = stdScal_43d3da1cd3bf

    

// Compute summary statistics by fitting the StandardScaler.

val scalerModel = scaler.fit(vecDF)

scalerModel: org.apache.spark.ml.feature.StandardScalerModel = stdScal_43d3da1cd3bf

    

// Normalize each feature to have unit standard deviation.

val scaledData: DataFrame = scalerModel.transform(vecDF)

scaledData: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 13 more fields]

    

scaledData.select("features", "scaledFeatures").show

+--------------------+--------------------+

|            features|      scaledFeatures|

+--------------------+--------------------+

|[0.0,37.0,10.0,3....|[-0.4413500298573...|

|[0.0,27.0,4.0,4.0...|[-0.4413500298573...|

|[0.0,32.0,15.0,1....|[-0.4413500298573...|

|[0.0,57.0,15.0,5....|[-0.4413500298573...|

|[0.0,22.0,0.75,2....|[-0.4413500298573...|

|[0.0,32.0,1.5,2.0...|[-0.4413500298573...|

|[0.0,22.0,0.75,2....|[-0.4413500298573...|

|[0.0,57.0,15.0,2....|[-0.4413500298573...|

|[0.0,32.0,15.0,4....|[-0.4413500298573...|

|[0.0,22.0,1.5,4.0...|[-0.4413500298573...|

|[0.0,37.0,15.0,2....|[-0.4413500298573...|

|[0.0,27.0,4.0,4.0...|[-0.4413500298573...|

|[0.0,47.0,15.0,5....|[-0.4413500298573...|

|[0.0,22.0,1.5,2.0...|[-0.4413500298573...|

|[0.0,27.0,4.0,4.0...|[-0.4413500298573...|

|[0.0,37.0,15.0,1....|[-0.4413500298573...|

|[0.0,37.0,15.0,2....|[-0.4413500298573...|

|[0.0,22.0,0.75,3....|[-0.4413500298573...|

|[0.0,22.0,1.5,2.0...|[-0.4413500298573...|

|[0.0,27.0,10.0,2....|[-0.4413500298573...|

+--------------------+--------------------+

only showing top 20 rows
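MinMaxScaler (already imported above) is a drop-in alternative when features should be rescaled to [0, 1] rather than standardized; also note that setWithMean(true) above produces dense output vectors. A minimal sketch (minMaxFeatures is an illustrative column name):

// Rescale each feature to [0, 1] instead of zero mean / unit variance.
val minMaxScaler = new MinMaxScaler().setInputCol("features").setOutputCol("minMaxFeatures")
val minMaxData = minMaxScaler.fit(vecDF).transform(vecDF)
minMaxData.select("features", "minMaxFeatures").show(5)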

 

6. Principal components (PCA)


// Principal components

val pca = new PCA().setInputCol("scaledFeatures").setOutputCol("pcaFeatures").setK(3).fit(scaledData)

    

pca.explainedVariance.values // explained variance of each component

res11: Array[Double] = Array(0.28779526464781313, 0.23798543640278289, 0.11742828783633019)

    

pca.pc // loadings (correlations between the observed variables and the principal components)

res12: org.apache.spark.ml.linalg.DenseMatrix =

-0.12034310848156521  0.05153952289637974   0.6678769450480689

-0.42860623714516627  0.05417889891307473   -0.05592377098140197

-0.44404074412877986  0.1926596811059294    -0.017025575192258197

-0.12233707317255231  0.08053139375662526   -0.5093149296300096

-0.14664751606128462  -0.3872166556211308   -0.03406819489501708

-0.145543746024348    -0.43054860653839705  0.07841454709046872

0.17703994181974803   -0.12792784984216296  -0.5173229755329072

0.2459668445061567    0.4915809641798787    0.010477548320795945

-0.2459668445061567   -0.4915809641798787   -0.010477548320795945

-0.44420980045271047  0.240652448514566     -0.089356723885704

0.4442098004527103    -0.24065244851456588  0.08935672388570405

    

pca.extractParamMap()

res13: org.apache.spark.ml.param.ParamMap =

{

    pca_40a453a54776-inputCol: scaledFeatures,

    pca_40a453a54776-k: 3,

    pca_40a453a54776-outputCol: pcaFeatures

}

    

pca.params

res14: Array[org.apache.spark.ml.param.Param[_]] = Array(pca_40a453a54776__inputCol, pca_40a453a54776__k, pca_40a453a54776__outputCol)

    

   

    

val pcaDF: DataFrame = pca.transform(scaledData)

pcaDF: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 14 more fields]

    

pcaDF.cache()

res15: pcaDF.type = [affairs: double, age: double ... 14 more fields]

    

    

pcaDF.printSchema()

root

 |-- affairs: double (nullable = true)

 |-- age: double (nullable = true)

 |-- yearsmarried: double (nullable = true)

 |-- religiousness: double (nullable = true)

 |-- education: double (nullable = true)

 |-- occupation: double (nullable = true)

 |-- rating: double (nullable = true)

 |-- gender: string (nullable = true)

 |-- children: string (nullable = true)

 |-- genderIndex: double (nullable = true)

 |-- genderVec: vector (nullable = true)

 |-- childrenIndex: double (nullable = true)

 |-- childrenVec: vector (nullable = true)

 |-- features: vector (nullable = true)

 |-- scaledFeatures: vector (nullable = true)

 |-- pcaFeatures: vector (nullable = true)

    

    

pcaDF.select("features", "scaledFeatures", "pcaFeatures").show

+--------------------+--------------------+--------------------+

|            features|      scaledFeatures|         pcaFeatures|

+--------------------+--------------------+--------------------+

|[0.0,37.0,10.0,3....|[-0.4413500298573...|[0.27828160409293...|

|[0.0,27.0,4.0,4.0...|[-0.4413500298573...|[2.42147114101165...|

|[0.0,32.0,15.0,1....|[-0.4413500298573...|[0.18301418047489...|

|[0.0,57.0,15.0,5....|[-0.4413500298573...|[-2.9795960667914...|

|[0.0,22.0,0.75,2....|[-0.4413500298573...|[1.79299133565688...|

|[0.0,32.0,1.5,2.0...|[-0.4413500298573...|[2.65694237441759...|

|[0.0,22.0,0.75,2....|[-0.4413500298573...|[3.48234503794570...|

|[0.0,57.0,15.0,2....|[-0.4413500298573...|[-2.4215838062079...|

|[0.0,32.0,15.0,4....|[-0.4413500298573...|[-0.6964555195741...|

|[0.0,22.0,1.5,4.0...|[-0.4413500298573...|[2.18771069800414...|

|[0.0,37.0,15.0,2....|[-0.4413500298573...|[-2.4259075891377...|

|[0.0,27.0,4.0,4.0...|[-0.4413500298573...|[-0.7743038356008...|

|[0.0,47.0,15.0,5....|[-0.4413500298573...|[-2.6176149267534...|

|[0.0,22.0,1.5,2.0...|[-0.4413500298573...|[2.95788535193022...|

|[0.0,27.0,4.0,4.0...|[-0.4413500298573...|[2.50146472861263...|

|[0.0,37.0,15.0,1....|[-0.4413500298573...|[-0.5123817022008...|

|[0.0,37.0,15.0,2....|[-0.4413500298573...|[-0.9191740114044...|

|[0.0,22.0,0.75,3....|[-0.4413500298573...|[2.97391491782863...|

|[0.0,22.0,1.5,2.0...|[-0.4413500298573...|[3.17940505267806...|

|[0.0,27.0,10.0,2....|[-0.4413500298573...|[0.74585406839527...|

+--------------------+--------------------+--------------------+

only showing top 20 rows
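The three components above explain about 28.8%, 23.8%, and 11.7% of the variance (roughly 64% together). To choose k by a cumulative-variance threshold instead of fixing it at 3, one sketch is to fit with the full feature dimension first (11 here; the 0.90 threshold is an illustrative choice, not from the original post):

// Fit PCA with all 11 components, then find the smallest k reaching 90% cumulative variance.
val fullPca = new PCA().setInputCol("scaledFeatures").setOutputCol("fullPcaFeatures").setK(11).fit(scaledData)
val cumulative = fullPca.explainedVariance.toArray.scanLeft(0.0)(_ + _).tail
val kFor90 = cumulative.indexWhere(_ >= 0.90) + 1
println(s"smallest k covering 90% of variance: $kFor90")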

 

7. Clustering


// Note the maximum number of iterations and the silhouette coefficient

    

val KSSE = (2 to 20 by 1).toList.map { k =>
  // Train a k-means model for this k.
  val kmeans = new KMeans().setK(k).setSeed(1L).setFeaturesCol("scaledFeatures")
  val model = kmeans.fit(scaledData)

  // Evaluate the clustering by computing the Within Set Sum of Squared Errors.
  val WSSSE = model.computeCost(scaledData)

  // (K, max iterations, SSE, cluster assignments, cluster sizes, cluster centers)
  // Note: getMaxIter returns the configured maxIter parameter, not the actual iteration count.
  (k, model.getMaxIter, WSSSE, model.summary.cluster, model.summary.clusterSizes, model.clusterCenters)
}

// Choose K from the SSE curve (elbow method)
val KSSEdf: DataFrame = KSSE.map { x => (x._1, x._2, x._3, x._5) }.toDF("K", "MaxIter", "SSE", "clusterSizes")

    

KSSE.foreach(println)
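The comment above also mentions the silhouette coefficient. Spark 2.3+ ships ClusteringEvaluator for exactly that; a minimal sketch for a single candidate k (this API postdates the Spark version used in this post, so treat it as an assumption):

import org.apache.spark.ml.evaluation.ClusteringEvaluator

// Mean silhouette over all points; range [-1, 1], higher is better. Spark 2.3+ only.
val evaluator = new ClusteringEvaluator().setFeaturesCol("scaledFeatures").setPredictionCol("prediction")
val km3 = new KMeans().setK(3).setSeed(1L).setFeaturesCol("scaledFeatures").fit(scaledData)
println(s"silhouette for k = 3: ${evaluator.evaluate(km3.transform(scaledData))}")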

 
