private static void broadCastOps(JavaSparkContext sc) { //加载user表到rdd JavaRDD<String> linesRDD = sc.parallelize(Arrays.asList("1,3,张三,河北", "2,1,李四,北京", "3,0,王五,天津", "4,1,赵六,广东")); JavaRDD<String> sexLineRDD = sc.parallelize(Arrays.asList("1,男", "0,女")); /** * 将小表对应的RDD拉取到driver节点之上 * 使用合适的数据结构加载广播变量中 */ List<String> sexList = sexLineRDD.collect(); Map<String, String> sexMap = new HashMap(); for(String sexLine : sexList) { String[] sexSplits = sexLine.split(","); sexMap.put(sexSplits[0], sexSplits[1]); } Broadcast<Map<String, String>> sexMapBC = sc.broadcast(sexMap); JavaRDD<String> retRDD = linesRDD.map(new Function<String, String>() { @Override public String call(String line) throws Exception { String[] splits = line.split(","); if (splits == null || splits.length < 4) { return null; } String sid = splits[1].trim(); //不建议直接在transformation中调用外部变量,而应该从广播变量中获取外部变量 //String sName = sexMap.get(sid); String sName = sexMapBC.value().getOrDefault(sid, "未知"); return splits[0] + " " + sName + " " + splits[2] + " " + splits[3]; } }); retRDD.foreach(str -> System.out.println(str)); }
Spark之广播变量
最新推荐文章于 2025-06-09 08:38:34 发布