import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
object NewsAnalysisComplete {
  def main(args: Array[String]): Unit = {
    // Create the SparkSession (local mode)
    val spark = SparkSession.builder()
      .appName("NewsAnalysisComplete")
      .master("local[*]")
      .config("spark.sql.adaptive.enabled", "true")
      .getOrCreate()

    // Define an explicit schema
    val schema = new StructType()
      .add("title", StringType, true)
      .add("url", StringType, true)
      .add("publish_date", StringType, true)
      .add("view_count", IntegerType, true)

    // Read the data from the CSV file
    val df = spark.read
      .option("header", "true")
      .schema(schema)
      .csv("D:\\sparkProject_syq\\input\\ujn_news.csv")

    // Data cleaning: drop rows that contain null values
    val cleanedDF = df.na.drop()

    // Print basic information
    val totalCount = cleanedDF.count()
    println(s"Row count after cleaning: $totalCount")
    val originalCount = df.count()
    println(s"Original row count: $originalCount")
    println(s"Rows filtered out: ${originalCount - totalCount}")
    val dateRange = cleanedDF.agg(min("publish_date"), max("publish_date")).collect()(0)
    println(s"Date range: ${dateRange.get(0)} to ${dateRange.get(1)}")
    // 1. Count articles published per day
    val dailyCount = cleanedDF.groupBy("publish_date")
      .agg(
        count("*").as("daily_count"),
        sum("view_count").as("daily_total_views"),
        avg("view_count").as("avg_views_per_article")
      )
      .orderBy(desc("publish_date"))

    // 2. Rank each article by view count
    val newsViews = cleanedDF.select("title", "publish_date", "view_count")
      .orderBy(desc("view_count"))

    // 3. Additional statistics: total and average view counts
    val totalStats = cleanedDF.agg(
      sum("view_count").as("total_views"),
      avg("view_count").as("average_views"),
      count("*").as("total_articles")
    )

    // Display the results
    println("\nDaily publication statistics:")
    dailyCount.show()
    println("\nView counts:")
    newsViews.show()
    println("\nOverall statistics:")
    totalStats.show()
    // Save the results to files, with a single file per output directory
    dailyCount.coalesce(1)
      .write
      .option("header", "true")
      .mode("overwrite")
      .csv("D:\\sparkProject_syq\\output\\daily_count")
    newsViews.coalesce(1)
      .write
      .option("header", "true")
      .mode("overwrite")
      .csv("D:\\sparkProject_syq\\output\\news_views")
    totalStats.coalesce(1)
      .write
      .option("header", "true")
      .mode("overwrite")
      .csv("D:\\sparkProject_syq\\output\\total_stats")
    // Build a combined report containing all results - a simpler approach
    // (collect() pulls every row to the driver, which is acceptable for small datasets)
    import spark.implicits._

    // First convert each DataFrame to a string representation
    val dailyStatsString = dailyCount.collect().map(row =>
      s"Daily stats: date=${row.getAs[String]("publish_date")}, articles=${row.getAs[Long]("daily_count")}, " +
        s"total views=${row.getAs[Long]("daily_total_views")}, average views=${row.getAs[Double]("avg_views_per_article")}"
    )
    val newsStatsString = newsViews.collect().map(row =>
      s"Article detail: title=${row.getAs[String]("title")}, date=${row.getAs[String]("publish_date")}, " +
        s"views=${row.getAs[Int]("view_count")}"
    )
    val totalStatsString = totalStats.collect().map(row =>
      s"Overall stats: total views=${row.getAs[Long]("total_views")}, average views=${row.getAs[Double]("average_views")}, " +
        s"total articles=${row.getAs[Long]("total_articles")}"
    )

    // Merge all statistics into a single RDD
    val allStats = spark.sparkContext.parallelize(
      dailyStatsString ++ newsStatsString ++ totalStatsString
    )

    // Save the merged results to a single file
    allStats.coalesce(1)
      .saveAsTextFile("D:\\sparkProject_syq\\output\\all_results")

    println("Analysis finished; results saved to the output directory")

    // Stop the SparkSession
    spark.stop()
  }
}

Convert the code above into a NewsAnalysisCompleteSpark class that can run on a Spark cluster; the input is a CSV file in HDFS and the output is a directory.
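A minimal sketch of the requested cluster version follows. It assumes the HDFS input CSV path and the output base directory are passed as command-line arguments; the hdfs:/// default paths and args(0)/args(1) fallbacks below are placeholders, not values from the original code. The .master() call is dropped so that spark-submit can supply the cluster master, and the driver-side combined report is omitted for brevity.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}

object NewsAnalysisCompleteSpark {
  def main(args: Array[String]): Unit = {
    // Input CSV on HDFS and output base directory come from the command line
    // (the defaults below are placeholder paths)
    val inputPath = if (args.length > 0) args(0) else "hdfs:///user/spark/input/ujn_news.csv"
    val outputDir = if (args.length > 1) args(1) else "hdfs:///user/spark/output"

    // No .master() here: the master is supplied by spark-submit (YARN, standalone, etc.)
    val spark = SparkSession.builder()
      .appName("NewsAnalysisCompleteSpark")
      .config("spark.sql.adaptive.enabled", "true")
      .getOrCreate()

    // Explicit schema, same as the local version
    val schema = new StructType()
      .add("title", StringType, true)
      .add("url", StringType, true)
      .add("publish_date", StringType, true)
      .add("view_count", IntegerType, true)

    // Read the CSV from HDFS and drop rows with null values
    val df = spark.read
      .option("header", "true")
      .schema(schema)
      .csv(inputPath)
    val cleanedDF = df.na.drop()

    // 1. Daily publication statistics
    val dailyCount = cleanedDF.groupBy("publish_date")
      .agg(
        count("*").as("daily_count"),
        sum("view_count").as("daily_total_views"),
        avg("view_count").as("avg_views_per_article")
      )
      .orderBy(desc("publish_date"))

    // 2. Per-article view-count ranking
    val newsViews = cleanedDF.select("title", "publish_date", "view_count")
      .orderBy(desc("view_count"))

    // 3. Overall totals
    val totalStats = cleanedDF.agg(
      sum("view_count").as("total_views"),
      avg("view_count").as("average_views"),
      count("*").as("total_articles")
    )

    // Write each result set to its own subdirectory under the output base directory
    dailyCount.coalesce(1).write.option("header", "true").mode("overwrite")
      .csv(s"$outputDir/daily_count")
    newsViews.coalesce(1).write.option("header", "true").mode("overwrite")
      .csv(s"$outputDir/news_views")
    totalStats.coalesce(1).write.option("header", "true").mode("overwrite")
      .csv(s"$outputDir/total_stats")

    spark.stop()
  }
}

A typical submission might look like the following; the cluster manager, deploy mode, jar name, and paths are illustrative:

spark-submit --class NewsAnalysisCompleteSpark --master yarn --deploy-mode cluster news-analysis.jar hdfs:///user/spark/input/ujn_news.csv hdfs:///user/spark/output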