Export to CSV
df.repartition(1).write.format("com.databricks.spark.csv").mode('overwrite').option("header", "true").save("/tmp/mydata.csv")
or
df.repartition(1).write.format("csv").save("/tmp/test.csv")
or
df.toPandas().to_csv("/tmp/test.csv")
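Note: repartition(1) produces a single output file but funnels every row through one task, and toPandas() collects the whole DataFrame into driver memory, so both only suit small results. A minimal sketch using the built-in writer with a header (assuming Spark 2.x):
df.coalesce(1).write.mode("overwrite").option("header", "true").csv("/tmp/test.csv")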
Converting between a Spark DataFrame and a pandas DataFrame
result = sqlContext.createDataFrame(pandasDF)
result.toPandas()
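With a SparkSession the same conversion is spark.createDataFrame(pandasDF). For large frames, a sketch of the Arrow speed-up (assumes Spark 2.3+ with pyarrow installed; the config was renamed in Spark 3.0):
spark.conf.set("spark.sql.execution.arrow.enabled", "true")  # speeds up toPandas/createDataFrame
result = spark.createDataFrame(pandasDF)
pdf = result.toPandas()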
Connecting to Elasticsearch
df = spark.read \
    .format("org.elasticsearch.spark.sql") \
    .option("es.nodes", "http://222.73.33.124:9200/") \
    .option("path", "logstash-log-2018.04.18") \
    .load()
Reference: https://www.elastic.co/guide/en/elasticsearch/hadoop/master/configuration.html
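Writing back to Elasticsearch goes through the same connector; a minimal sketch, where the target resource logstash-out/doc is hypothetical:
df.write \
    .format("org.elasticsearch.spark.sql") \
    .option("es.nodes", "http://222.73.33.124:9200/") \
    .mode("append") \
    .save("logstash-out/doc")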
Adding a custom column with a UDF
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def getGameId(data_str):
    # Map the event prefix to a game id
    if data_str.startswith('[xxx11]'):
        return 2
    if data_str.startswith('[xxx22]'):
        return 1
    return 0

getGameIdFunc = udf(getGameId, IntegerType())
df2 = df2.withColumn("gameId", getGameIdFunc(df2["event"]))
# Add a constant column
from pyspark.sql.functions import lit
df.withColumn("date", lit(111))
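lit is needed because withColumn expects a Column, not a bare Python value; the same pattern works for strings:
df.withColumn("source", lit("app"))  # "source"/"app" are placeholder names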
Selecting columns, casting to int, and aliasing
from pyspark.sql.functions import from_unixtime, unix_timestamp
# adding 8 * 3600 shifts the epoch seconds to UTC+8 before formatting
df2 = df2.select(from_unixtime(unix_timestamp(df2["@timestamp"]) + 8 * 3600).alias("timestamp"), "event", df2["n_maxGames"].cast("int").alias("maxGames"))
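A timezone-aware alternative to adding 8 * 3600 by hand, assuming @timestamp parses as a timestamp and the target zone is Asia/Shanghai:
from pyspark.sql.functions import from_utc_timestamp
df2 = df2.withColumn("timestamp", from_utc_timestamp(df2["@timestamp"], "Asia/Shanghai"))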
Set difference of two DataFrames
diff = df.subtract(df2)
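subtract behaves like SQL EXCEPT DISTINCT, so duplicate rows are dropped; Spark 2.4+ also offers exceptAll, which keeps duplicates:
diff_all = df.exceptAll(df2)  # Spark 2.4+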
Clearing all caches
spark.catalog.clearCache()
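To cache or release a single DataFrame instead of everything:
df.cache()      # or df.persist()
df.unpersist()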
Reading from and writing to a database over JDBC
result1.write.jdbc(
    url="jdbc:mysql://host/dbname"
        "?user=root&password=xxx&rewriteBatchedStatements=true",
    mode="append",
    table="xxx",
    properties={"driver": "com.mysql.jdbc.Driver"})
# rewriteBatchedStatements lets the MySQL driver rewrite batched inserts
# into multi-row INSERT statements (the equivalent of insertmany)
df2 = sqlContext.read.jdbc(
    url="jdbc:mysql://host/dbname"
        "?user=xxx&password=xxx",
    table="xxx",
    properties={"driver": "com.mysql.jdbc.Driver"})
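For large tables the read can be parallelized by range-partitioning on a numeric column; a sketch, assuming a numeric id column and roughly known bounds (both hypothetical here):
df2 = sqlContext.read.jdbc(
    url="jdbc:mysql://host/dbname?user=xxx&password=xxx",
    table="xxx",
    column="id",          # numeric split column, assumed to exist
    lowerBound=1,
    upperBound=1000000,
    numPartitions=8,
    properties={"driver": "com.mysql.jdbc.Driver"})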
Creating a temporary view
result.createOrReplaceTempView("result")
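The view can then be queried with Spark SQL, assuming a SparkSession named spark:
spark.sql("select * from result limit 10").show()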
Creating a DataFrame
g_result=[[1,2,3],[2,3,4]]
result = sc.parallelize(g_result).toDF(['aa','bb','cc'])
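With a SparkSession the same DataFrame can be built directly, skipping the RDD:
result = spark.createDataFrame(g_result, ['aa', 'bb', 'cc'])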
Randomly sampling a fixed number of rows with Spark SQL
%sql
select * from table order by rand() limit 1000
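The DataFrame API equivalent, assuming the table is loaded as df; note that order by rand() sorts the whole table, so df.sample() is cheaper when an approximate row count is acceptable:
from pyspark.sql.functions import rand
sample = df.orderBy(rand()).limit(1000)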