Requirements:
1. Read the CSV and keep only the rows whose Tier column contains "T".
2. Create a flag column: when _201903 > _201902 the flag is "up", otherwise "down".
3. Insert the result into a MySQL database.
Without further ado, here is the code:
--------------------------------------------------------------------------------
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import java.util.HashMap;
import java.util.Properties;
public class SparkCsvDemo {
public static void main(String[] args) {
String hdfsInAddress = "D:\\DevTemp\\AWS\\";//"hdfs://192.168.209.129:9000/"; //server ip
String inputAddress = "";//"in/";
String csvFileName="emr-demo-data.csv";
SparkConf conf = new SparkConf().setMaster("local").setAppName("TestSpark");
System.out.println("==================");
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(sc);
HashMap<String,String> options = new HashMap<String,String> ();
options.put("header", "true");//设置第一行为头
options.put("inferSchema", "true");//设置自动分析片段类型
//options.put("path", hdfsInAddress + inputAddress + filePath);
options.put("path", this.hdfsInAddress + this.inputAddress + this.csvFileName);
options.put("dateFormat","YYYY-MM-DD");
System.out.println("打印上传文件在hdfs的路径:"+hdfsInAddress + inputAddress + csvFileName);
/****声明字段类型****/
StructField structFields[] = new StructField[9];
structFields[0] = DataTypes.createStructField("Tier", DataTypes.StringType,true);
structFields[1] = DataTypes.createStructField("SellerCode",DataTypes.StringType,true);
structFields[2] = DataTypes.createStructField("SellerName",DataTypes.StringType,true);
structFields[3] = DataTypes.createStructField("DataSource",DataTypes.StringType,true);
structFields[4] = DataTypes.createStructField("SellerProvince",DataTypes.StringType,true);
structFields[5] = DataTypes.createStructField("_201901",DataTypes.DoubleType,true);
structFields[6] = DataTypes.createStructField("_201902",DataTypes.DoubleType,true);
structFields[7] = DataTypes.createStructField("_201903",DataTypes.DoubleType,true);
structFields[8] = DataTypes.createStructField("flag",DataTypes.StringType,true);
StructType structType = new StructType(structFields);
Dataset<Row> dataFrame = sqlContext.load("com.databricks.spark.csv", structType, options);
// DataFrame cars = (new CsvParser()).withUseHeader(true).csvFile(sqlContext, "cars.csv");// alternative: read the CSV through CsvParser
dataFrame.registerTempTable("result");
StringBuffer sparkSql = new StringBuffer("select ");
sparkSql.append("Tier");
sparkSql.append(", SellerCode");
sparkSql.append(", SellerName");
sparkSql.append(", DataSource");
sparkSql.append(", SellerProvince");
sparkSql.append(", _201901");
sparkSql.append(", _201902");
sparkSql.append(", _201903");
sparkSql.append(", if(_201903>_201902,'up','down') as flag");
sparkSql.append(" from result");
Dataset<Row> resultFrame = sqlContext.sql(sparkSql.toString());
//resultFrame.createOrReplaceTempView("resultView");// create a temp view
//System.out.println("***************用Dataset打印*peopleScore********"+resultFrame.limit(10).showString(20,0,false));
System.out.println("******print schema *******");
resultFrame.printSchema();
System.out.println("*************");
//resultFrame.select("SellerName").show();
System.out.println("*************");
//Tier SellerCode SellerName DataSource SellerProvince _201901 _201902 _201903
Dataset<Row> df = resultFrame.select(
resultFrame.col("Tier"),
resultFrame.col("SellerCode"),
resultFrame.col("SellerName"),
resultFrame.col("DataSource"),
resultFrame.col("SellerProvince"),
resultFrame.col("_201901"),
resultFrame.col("_201902"),
resultFrame.col("_201903"),
resultFrame.col("flag"));
df = df.filter(df.col("Tier").contains("T"));//where condition:equalTo/
//df = df.filter((df.col("_201902").cast(DataTypes.FloatType)).gt((df.col("201901").cast(DataTypes.FloatType))));//gt 大于
//df = df.orderBy(df.col("_201902").cast(DataTypes.FloatType).asc_nulls_first());//转换类型并升序
//df.groupBy("age").count();//分组
df.show();
/************* write the result to the MySQL database ******************/
// database connection settings
String url = "jdbc:mysql://127.0.0.1:3306/hive?useUnicode=true&characterEncoding=utf-8";
Properties connectionProperties = new Properties();
connectionProperties.put("user","root");
connectionProperties.put("password","123456");
connectionProperties.put("driver","com.mysql.jdbc.Driver");
/** insert into the database table **/
df.write().mode(SaveMode.Overwrite).jdbc(url, "t_result", connectionProperties);// Overwrite replaces both the data and the table schema
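// Note: SaveMode.Append would keep the existing table definition and only append rows.
// Either way, the MySQL JDBC driver (the mysql-connector-java jar providing com.mysql.jdbc.Driver) must be on the classpath.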
sc.stop();
}
}
-------------------------------------------------------------
The CSV file used for testing:
The result inserted into MySQL:
The above was tested and works. Since this was an exploratory implementation, the logic was not extracted into separate methods; feel free to refactor it for your own needs. A rough sketch of such a cleanup, using the newer Spark API, follows below.
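For reference, here is a minimal sketch of the same flow written against the Spark 2.x API (SparkSession, spark.read().csv() and withColumn() instead of the deprecated SQLContext.load and the temp-table SQL). It is only an illustration under the same assumptions as above: the local CSV path, a MySQL table named t_result, and CSV headers matching the column names used earlier (_201901, _201902, _201903); adjust these to your environment.
--------------------------------------------------------------------------------
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import java.util.Properties;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.when;

public class SparkCsvDemoRefactored {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local")
                .appName("TestSpark")
                .getOrCreate();

        // Read the CSV; the first row is the header and column types are inferred.
        Dataset<Row> df = spark.read()
                .option("header", "true")
                .option("inferSchema", "true")
                .csv("D:\\DevTemp\\AWS\\emr-demo-data.csv");

        // Keep rows whose Tier contains "T" and derive the flag column.
        Dataset<Row> result = df
                .filter(col("Tier").contains("T"))
                .withColumn("flag",
                        when(col("_201903").gt(col("_201902")), "up").otherwise("down"));

        // Write the result into MySQL, replacing the table if it already exists.
        Properties props = new Properties();
        props.put("user", "root");
        props.put("password", "123456");
        props.put("driver", "com.mysql.jdbc.Driver");
        String url = "jdbc:mysql://127.0.0.1:3306/hive?useUnicode=true&characterEncoding=utf-8";
        result.write().mode(SaveMode.Overwrite).jdbc(url, "t_result", props);

        spark.stop();
    }
}
-------------------------------------------------------------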