Maven dependencies:
<!-- Spark core, SQL, and Hive modules -->
<dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-core_2.10</artifactId> <version>1.6.0</version> </dependency>
<dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-sql_2.10</artifactId> <version>1.6.0</version> </dependency>
<dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-hive_2.10</artifactId> <version>1.6.0</version> </dependency>
<!-- Google utility library (Guava) -->
<dependency> <groupId>com.google.guava</groupId> <artifactId>guava</artifactId> <version>18.0</version> </dependency>
public class UDF { public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("UDF").setMaster("local"); JavaSparkContext sc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(sc); List<String> nameList = Arrays.asList("xiaoming", "feifei", "katong"); //转换为javaRDD JavaRDD<String> nameRDD = sc.parallelize(nameList, 3); //转换为JavaRDD<Row> JavaRDD<Row> nameRowRDD = nameRDD.map(new Function<String, Row>() { public Row call(String name) throws Exception { return RowFactory.create(name); } }); List<StructField> fields = Lists.newArrayList(); fields.add(DataTypes.createStructField("name", DataTypes.StringType,true)); StructType structType = DataTypes.createStructType(fields); DataFrame namesDF = sqlContext.createDataFrame(nameRowRDD, structType); //注册names表 namesDF.registerTempTable("names"); /** * Function可以使用UDF1到UDF22/21?,所表达的意思就是几个参数,2代指两个入参,10代指10个入参 * return返回的即为UDF<>的最后一个参数, */ sqlContext.udf().register("strLen", new UDF1<String, Integer>() { public Integer call(String s) throws Exception { return s.length(); } },DataTypes.IntegerType); List<Row> rows = sqlContext.sql("select name,strLen(name) from names").javaRDD().collect(); for (Row row : rows) { System.out.println("name:"+row.get(0)+",长度:"+row.get(1)); } sc.close(); } }