from pyspark.sql import SparkSession

# Entry point: a Hive-enabled SparkSession. SparkSession supersedes the
# deprecated SQLContext (and the original `SQLContext(spark)` call was also
# wrong — the constructor expects a SparkContext, not a SparkSession).
spark = SparkSession.builder.appName('face_history').enableHiveSupport().getOrCreate()
sc = spark.sparkContext

# JSON logs selected by glob — all files/dirs under /home starting with 202004
# (i.e. April 2020). NOTE(review): assumes each record has a nested struct
# field `data.user_id` — confirm against the log schema.
path_cluster = "/home/202004*"
df = spark.read.json(path_cluster)

# Select the nested field and alias it in one step. Without an explicit
# alias the resulting column name is version-dependent (it may keep the
# dotted name "data.user_id"), which would break the attribute access
# df_.user_id used below — aliasing here makes the rename unnecessary.
df_ = df.select(df.data.user_id.alias("user_id"))

# printSchema() prints to stdout itself and returns None — do not wrap it
# in print(), which would emit a spurious "None" line.
df_.printSchema()

# Total number of records.
print(df_.count())

# Counts of user_ids beginning with '800' and with '700'.
print(df_.filter(df_.user_id.startswith('800')).count())
print(df_.filter(df_.user_id.startswith('700')).count())