利用UDF对dataframe列数据进行修改

原创
2018/09/13 11:20
阅读数 4.1K

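/*

Setup needed when running outside spark-shell (the shell normally provides
these automatically); they supply udf, toDF and the $"column" syntax used below:

import org.apache.spark.sql.functions._

val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._

*/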


/*

https://stackoverflow.com/questions/34614239/how-to-apply-a-function-to-a-column-of-a-spark-dataframe

https://jaceklaskowski.gitbooks.io/mastering-spark-sql/spark-sql-Column.html

https://www.jianshu.com/p/833b72adb2b6

*/


import org.apache.spark.sql.functions.udf
// Sample DataFrame with an Int column "number" and a String column "polish".
val df = Seq(
  (1, "jeden"),
  (2, "dwa"),
  (3, "jerry"),
  (0, "tom")
).toDF("number", "polish")

scala> df.show
+------+------+
|number|polish|
+------+------+
|     1| jeden|
|     2|   dwa|
|     3| jerry|
|     0|   tom|
+------+------+



// UDF mapping an Int to a binary label: 1 when the value is greater than 0, otherwise 0.
val label_class = udf { (value: Int) =>
  if (value > 0) 1 else 0
}
scala> df.withColumn("number", label_class($"number")).show
+------+------+
|number|polish|
+------+------+
|     1| jeden|
|     1|   dwa|
|     1| jerry|
|     0|   tom|
+------+------+

scala> val data = df.withColumn("number", label_class($"number"))
data: org.apache.spark.sql.DataFrame = [number: int, polish: string]

scala> data
res3: org.apache.spark.sql.DataFrame = [number: int, polish: string]

scala> data.show
+------+------+
|number|polish|
+------+------+
|     1| jeden|
|     1|   dwa|
|     1| jerry|
|     0|   tom|
+------+------+
展开阅读全文
打赏
0
0 收藏
分享
加载中
更多评论
打赏
0 评论
0 收藏
0
分享
返回顶部
顶部