1. A Spark environment
2. The carbondata_XXX.jar package
3. The carbon.properties configuration file:
# Store location for CarbonData table data files
carbon.storelocation=/tmp/carbon/carbonStore
# Base HDFS path prepended to relative paths used in DDL/DML statements
carbon.ddl.base.hdfs.url=/tmp/carbon/data
# Lock implementation; HDFSLOCK takes locks via files on HDFS
carbon.lock.type=HDFSLOCK
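Spark does not pick this file up automatically. A common approach (a sketch, assuming the file sits at the hypothetical path /opt/carbon/conf/carbon.properties; check the installation guide for your CarbonData version) is to point the driver and executors at it in spark-defaults.conf:
# Hypothetical location of carbon.properties; adjust to your deployment
spark.driver.extraJavaOptions   -Dcarbon.properties.filepath=/opt/carbon/conf/carbon.properties
spark.executor.extraJavaOptions -Dcarbon.properties.filepath=/opt/carbon/conf/carbon.properties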
4. Using CarbonData from spark-shell:
1. Start spark-shell with the CarbonData jar:
./bin/spark-shell --jars LOCAL_PATH/carbondata.jar
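The carbon.properties pointer from the configuration section can also be passed on the command line instead of via spark-defaults.conf; a sketch with hypothetical paths:
./bin/spark-shell \
  --jars /opt/carbon/carbondata.jar \
  --conf "spark.driver.extraJavaOptions=-Dcarbon.properties.filepath=/opt/carbon/conf/carbon.properties"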
2. Import the required packages:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.CarbonSession._
3. Create a CarbonSession (the argument is the store location, matching carbon.storelocation above):
scala>
val carbon = SparkSession.builder().config(sc.getConf).getOrCreateCarbonSession("hdfs://ns9/tmp/carbon/carbonStore")
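Individual CarbonData properties can also be set programmatically before creating the session, instead of (or in addition to) carbon.properties; a minimal sketch, with an illustrative key and value:
scala>
import org.apache.carbondata.core.util.CarbonProperties
// Illustrative property; any key from carbon.properties can be set this way
CarbonProperties.getInstance().addProperty("carbon.timestamp.format", "yyyy-MM-dd HH:mm:ss")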
4. Run SQL statements:
Create a table:
scala>
carbon.sql("create table test.carbon_test(id STRING, name STRING, city STRING, age INT) STORED BY 'carbondata'")
Drop the table:
scala>
carbon.sql("drop table test.carbon_test")
Query table data:
scala>
carbon.sql("select * from test.carbon_test").show
5. Load data:
a. Create sample data:
Create a local file:
cat > sample.csv << EOF
id,name,city,age
1,david,shenzhen,31
2,eason,shenzhen,27
3,jarry,wuhan,35
EOF
b. Put it into HDFS:
hdfs dfs -put sample.csv /tmp/carbon/data/sample.csv
c. LOAD the data into the table:
scala>
carbon.sql("LOAD DATA INPATH 'hdfs://ns9/tmp/carbon/data/sample.csv' INTO TABLE test.carbon_test")
(use the absolute path of the file)
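To confirm the load succeeded, query the table back; the three sample rows should appear. (Depending on the CarbonData version, LOAD DATA also accepts an OPTIONS clause for things like delimiters and file headers; see the DML documentation for your release.)
scala>
carbon.sql("select count(*) from test.carbon_test").show
carbon.sql("select * from test.carbon_test order by id").show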