package cn.itcast.mr.dedup;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class ParallelFPGrowth {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hadoop cluster connection settings
        conf.setInt("ipc.maximum.data.length", 2000000000);
        conf.setInt("ipc.maximum.response.length", 2000000000);
        conf.setInt("dfs.client.socket-timeout", 1200000);
        conf.setBoolean("mapreduce.map.output.compress", true);
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        conf.set("fs.defaultFS", "hdfs://192.168.88.101:8020");
        // PFP algorithm parameters
        conf.setFloat("pfp.min.support", 0.05f); // minimum support
        conf.setInt("pfp.num.groups", 10);       // number of groups
        conf.setInt("pfp.max.heap.size", 50);    // maximum heap size
        // Input and output paths are specified directly
        String inputPath = "hdfs://192.168.88.101:8020/input1";
        String outputPath = "hdfs://192.168.88.101:8020/output2";

        // Pass 1: count frequent items
        Job job1 = Job.getInstance(conf, "PFP Pass 1");
        job1.setJarByClass(ParallelFPGrowth.class);
        job1.setMapperClass(PFPMapper.Pass1Mapper.class);
        job1.setReducerClass(PFPReducer.Pass1Reducer.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job1, new Path(inputPath));
        FileOutputFormat.setOutputPath(job1, new Path(outputPath + "/pass1"));
        if (!job1.waitForCompletion(true)) {
            System.exit(1);
        }

        // Pass 2: parallel FP-Growth
        Job job2 = Job.getInstance(conf, "PFP Pass 2");
        job2.setJarByClass(ParallelFPGrowth.class);
        job2.setMapperClass(PFPMapper.Pass2Mapper.class);
        job2.setReducerClass(PFPReducer.Pass2Reducer.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job2, new Path(inputPath));
        FileOutputFormat.setOutputPath(job2, new Path(outputPath + "/pass2"));
        System.exit(job2.waitForCompletion(true) ? 0 : 1);
    }
}
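Since the output paths are hard-coded and the post mentions deleting the output directory by hand before each rerun from IDEA, a small helper along the following lines could clear stale output programmatically with the standard Hadoop FileSystem API. This is my own sketch, not part of the original code, and the class and method names are made up.

package cn.itcast.mr.dedup;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical helper (not in the original code): removes an existing output
// directory so a rerun does not fail with "Output directory ... already exists".
public final class OutputCleaner {
    private OutputCleaner() {
    }

    public static void deleteIfExists(Configuration conf, String outputPath) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        Path out = new Path(outputPath);
        if (fs.exists(out)) {
            fs.delete(out, true); // recursive delete
        }
    }
}

It would be called as OutputCleaner.deleteIfExists(conf, outputPath); in main(), right before Job.getInstance(conf, "PFP Pass 1").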
package cn.itcast.mr.dedup;

import java.util.List;
public class Pattern {
    // Simple value object describing a frequent pattern and its support count
    private List<String> items;
    private int support;

    public Pattern(List<String> items, int support) {
        this.items = items;
        this.support = support;
    }

    public List<String> getItems() {
        return items;
    }

    public int getSupport() {
        return support;
    }

    @Override
    public String toString() {
        return items.toString() + " (" + support + ")";
    }
}
package cn.itcast.mr.dedup;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class PFPMapper {
    // Pass 1 Mapper: count item occurrences
    public static class Pass1Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text item = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] items = value.toString().split(" ");
            for (String i : items) {
                // Keep only tags made of letters (filters out punctuation, numbers, etc.)
                if (i.matches("[a-zA-Z]+")) {
                    item.set(i);
                    context.write(item, one);
                }
            }
        }
    }

    // Pass 2 Mapper: assign each transaction to a group
    public static class Pass2Mapper extends Mapper<LongWritable, Text, Text, Text> {
        private Text groupId = new Text();
        private Text transaction = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] items = value.toString().split("\\s+");
            if (items.length == 0) return;
            // Decide the group from the hash of the first item
            int group = Math.abs(items[0].hashCode())
                    % context.getConfiguration().getInt("pfp.num.groups", 10);
            groupId.set(String.valueOf(group));
            // Rebuild the transaction string
            StringBuilder sb = new StringBuilder();
            for (String item : items) {
                sb.append(item).append(" ");
            }
            transaction.set(sb.toString().trim());
            context.write(groupId, transaction);
        }
    }
}
package cn.itcast.mr.dedup;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
public class PFPReducer {
    // Pass 1 Reducer: sum up the item counts
    public static class Pass1Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    // Pass 2 Reducer: run the FP-Growth algorithm on each group
    public static class Pass2Reducer extends Reducer<Text, Text, Text, Text> {
        private Text result = new Text();
        private int numThreads;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            // Read the thread count from the configuration; default to the number of available processors
            numThreads = context.getConfiguration().getInt("pfp.num.threads",
                    Runtime.getRuntime().availableProcessors());
        }

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Collect all transactions of this group
            List<List<String>> transactions = new ArrayList<>();
            for (Text val : values) {
                String[] items = val.toString().split(" ");
                transactions.add(Arrays.asList(items));
            }
            // Read the minimum support
            float minSupport = context.getConfiguration().getFloat("pfp.min.support", 0.05f);
            // Create an FPGrowth instance that mines with multiple threads
            FPGrowth fpGrowth = new FPGrowth(minSupport, numThreads);
            Map<List<String>, Integer> frequentPatterns = fpGrowth.findFrequentPatterns(transactions);
            // Emit the frequent itemsets
            for (Map.Entry<List<String>, Integer> entry : frequentPatterns.entrySet()) {
                StringBuilder patternStr = new StringBuilder();
                for (String item : entry.getKey()) {
                    patternStr.append(item).append(" ");
                }
                result.set(patternStr.toString().trim() + " (" + entry.getValue() + ")");
                context.write(key, result);
            }
        }
    }
}
package cn.itcast.mr.dedup;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import java.io.IOException;
public class TransactionReader extends RecordReader<Text, Text> {
    private LineRecordReader lineRecordReader;
    private Text key;
    private Text value;

    public TransactionReader() {
        lineRecordReader = new LineRecordReader();
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        lineRecordReader.initialize(split, context);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!lineRecordReader.nextKeyValue()) {
            return false;
        }
        // Use the offset reported by LineRecordReader (as text) as the key and the line contents as the value
        if (key == null) {
            key = new Text();
        }
        if (value == null) {
            value = new Text();
        }
        key.set(String.valueOf(lineRecordReader.getCurrentKey().get()));
        value.set(lineRecordReader.getCurrentValue());
        return true;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return lineRecordReader.getProgress();
    }

    @Override
    public void close() throws IOException {
        lineRecordReader.close();
    }
}
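TransactionReader is not referenced anywhere in the driver; a custom RecordReader only takes effect when it is returned by an InputFormat that the job registers. For reference, a minimal wiring sketch could look like the following; the class name TransactionInputFormat is my own and does not appear in the original code.

package cn.itcast.mr.dedup;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// Hypothetical InputFormat (not in the original code) that hands TransactionReader to the framework.
public class TransactionInputFormat extends FileInputFormat<Text, Text> {
    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        return new TransactionReader();
    }
}

It would be registered with job.setInputFormatClass(TransactionInputFormat.class); note that the mappers above declare LongWritable keys, so their key type would have to change to Text before this reader could actually be plugged in.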
package cn.itcast.mr.dedup;

import java.util.HashMap;
import java.util.Map;
class TreeNode {
    String item;
    int count;
    TreeNode parent;
    Map<String, TreeNode> children;
    // Link to the next node in the tree that holds the same item (used by the header table)
    TreeNode nodeLink;

    public TreeNode(String item, int count, TreeNode parent) {
        this.item = item;
        this.count = count;
        this.parent = parent;
        this.children = new HashMap<>();
        this.nodeLink = null;
    }

    public void increment(int count) {
        this.count += count;
    }
}
package cn.itcast.mr.dedup;

import java.util.*;
import java.util.concurrent.*;
import java.util.stream.Collectors;
/**
 * FP-Growth implementation that can mine patterns in parallel for better performance.
 */
public class FPGrowth {
    private final float minSupport;
    private final int numThreads;
    private final ExecutorService executorService;

    public FPGrowth(float minSupport) {
        this(minSupport, Runtime.getRuntime().availableProcessors());
    }

    public FPGrowth(float minSupport, int numThreads) {
        this.minSupport = minSupport;
        this.numThreads = numThreads;
        this.executorService = Executors.newFixedThreadPool(numThreads);
    }
    public Map<List<String>, Integer> findFrequentPatterns(List<List<String>> transactions) {
        if (transactions.isEmpty()) {
            return Collections.emptyMap();
        }
        // Step 1: count the global frequency of each item
        Map<String, Integer> globalItemCounts = countItems(transactions);
        // Convert the minimum support ratio into an absolute count
        int minCount = (int) Math.ceil(minSupport * transactions.size());
        // Filter and sort the frequent items (descending frequency, then by name)
        List<String> frequentItems = globalItemCounts.entrySet().stream()
                .filter(e -> e.getValue() >= minCount)
                .sorted(Map.Entry.<String, Integer>comparingByValue(Comparator.reverseOrder())
                        .thenComparing(Map.Entry.comparingByKey()))
                .map(Map.Entry::getKey)
                .collect(Collectors.toList());
        // Build an item-to-index map for faster ordering lookups
        Map<String, Integer> itemIndexMap = new HashMap<>();
        for (int i = 0; i < frequentItems.size(); i++) {
            itemIndexMap.put(frequentItems.get(i), i);
        }
        // Step 2: build the FP-tree
        TreeNode root = buildFPTree(transactions, itemIndexMap, minCount);
        // Step 3: mine frequent patterns
        Map<List<String>, Integer> frequentPatterns = new ConcurrentHashMap<>();
        try {
            // Create one task per frequent item
            List<Callable<Void>> tasks = new ArrayList<>();
            for (String item : frequentItems) {
                tasks.add(() -> {
                    List<String> prefixPath = Collections.singletonList(item);
                    Map<List<String>, Integer> conditionalPatterns =
                            findPatternsInConditionalTree(root, item, itemIndexMap, minCount);
                    // Add the conditional patterns to the result
                    for (Map.Entry<List<String>, Integer> entry : conditionalPatterns.entrySet()) {
                        List<String> pattern = new ArrayList<>(prefixPath);
                        pattern.addAll(entry.getKey());
                        frequentPatterns.put(pattern, entry.getValue());
                    }
                    // Record the support of the single item itself
                    frequentPatterns.put(prefixPath, globalItemCounts.get(item));
                    return null;
                });
            }
            // Run all tasks and wait for them to finish
            executorService.invokeAll(tasks);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException("Interrupted while mining frequent patterns", e);
        } finally {
            // The pool is shut down here, so each FPGrowth instance can only be used for one call
            executorService.shutdown();
        }
        return frequentPatterns;
}
    private Map<String, Integer> countItems(List<List<String>> transactions) {
        Map<String, Integer> itemCounts = new ConcurrentHashMap<>();
        // Count item frequencies in parallel
        transactions.parallelStream().forEach(transaction -> {
            for (String item : transaction) {
                itemCounts.compute(item, (k, v) -> v == null ? 1 : v + 1);
            }
        });
        return itemCounts;
}
    private TreeNode buildFPTree(List<List<String>> transactions,
                                 Map<String, Integer> itemIndexMap,
                                 int minCount) {
        // Create the root node; it carries no item so that buildPrefixPath can stop at it
        TreeNode root = new TreeNode(null, 0, null);
        // Build the header table
        Map<String, TreeNode> headerTable = new HashMap<>();
        // Process the transactions in parallel
        List<Runnable> tasks = new ArrayList<>();
        for (List<String> transaction : transactions) {
            // Keep only frequent items and sort them by global frequency order
            List<String> filteredItems = transaction.stream()
                    .filter(itemIndexMap::containsKey)
                    .sorted(Comparator.comparingInt(itemIndexMap::get))
                    .collect(Collectors.toList());
            if (!filteredItems.isEmpty()) {
                tasks.add(() -> insertTransaction(filteredItems, root, headerTable));
            }
        }
        // Execute the insert tasks in parallel (insertTransaction is synchronized, see below)
        executeTasksInParallel(tasks);
        return root;
}
    // Synchronized so that concurrent insert tasks do not corrupt the shared HashMaps in the tree
    private synchronized void insertTransaction(List<String> items, TreeNode root, Map<String, TreeNode> headerTable) {
        TreeNode currentNode = root;
        for (String item : items) {
            // Check whether a child node for this item already exists
            TreeNode child = currentNode.children.get(item);
            if (child == null) {
                // Create a new node
                child = new TreeNode(item, 1, currentNode);
                currentNode.children.put(item, child);
                // Update the header table
                updateHeaderTable(item, child, headerTable);
            } else {
                // The node already exists, just increase its count
                child.increment(1);
            }
            currentNode = child;
        }
}
    private void updateHeaderTable(String item, TreeNode node, Map<String, TreeNode> headerTable) {
        synchronized (headerTable) {
            if (!headerTable.containsKey(item)) {
                headerTable.put(item, node);
            } else {
                // Walk to the end of the node-link chain and append the new node
                TreeNode tail = headerTable.get(item);
                while (tail.nodeLink != null) {
                    tail = tail.nodeLink;
                }
                tail.nodeLink = node;
            }
        }
    }
    private Map<List<String>, Integer> findPatternsInConditionalTree(
            TreeNode root, String item, Map<String, Integer> itemIndexMap, int minCount) {
        // Collect all conditional pattern bases
        List<List<String>> conditionalPatternBases = new ArrayList<>();
        // Get all nodes of this item from the header table
        TreeNode node = findNodeInHeaderTable(root, item);
        while (node != null) {
            // Build the prefix path of this node
            List<String> prefixPath = buildPrefixPath(node);
            if (!prefixPath.isEmpty()) {
                // The prefix path is repeated as many times as this node's count
                for (int i = 0; i < node.count; i++) {
                    conditionalPatternBases.add(new ArrayList<>(prefixPath));
                }
            }
            node = node.nodeLink;
        }
        // If there are no conditional pattern bases, return immediately
        if (conditionalPatternBases.isEmpty()) {
            return Collections.emptyMap();
        }
        // Recursively mine the conditional FP-tree. A fresh single-threaded instance is used here:
        // calling findFrequentPatterns on "this" would submit to (and then shut down) the very pool
        // that is currently executing this task.
        return new FPGrowth(minSupport, 1).findFrequentPatterns(conditionalPatternBases);
}
    private TreeNode findNodeInHeaderTable(TreeNode root, String item) {
        // Simplified lookup: a complete implementation would walk the header table built in buildFPTree;
        // for brevity this assumes the item appears as a direct child of the root node.
        return root.children.get(item);
    }
    private List<String> buildPrefixPath(TreeNode node) {
        List<String> path = new ArrayList<>();
        TreeNode current = node.parent;
        while (current != null && current.item != null) {
            path.add(0, current.item);
            current = current.parent;
        }
        return path;
}
    private void executeTasksInParallel(List<Runnable> tasks) {
        // Submit every task to the shared thread pool
        List<Future<?>> futures = new ArrayList<>();
        for (Runnable task : tasks) {
            futures.add(executorService.submit(task));
        }
        // Wait for all tasks to finish
        for (Future<?> future : futures) {
            try {
                future.get();
            } catch (Exception e) {
                throw new RuntimeException("Error while executing a task", e);
            }
        }
}
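    // Minimal local usage sketch (my own addition, not part of the original code); the sample
    // transactions below are invented purely for illustration. With minSupport = 0.5f an itemset
    // must occur in at least half of the transactions to be reported.
    public static void main(String[] args) {
        List<List<String>> transactions = Arrays.asList(
                Arrays.asList("bread", "milk", "beer"),
                Arrays.asList("bread", "milk"),
                Arrays.asList("milk", "beer"));
        FPGrowth miner = new FPGrowth(0.5f, 2);
        Map<List<String>, Integer> patterns = miner.findFrequentPatterns(transactions);
        // Each entry is an itemset with its support count, e.g. [milk] (3)
        patterns.forEach((items, support) -> System.out.println(items + " (" + support + ")"));
    }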
}

When the code above is run, the following errors appear:

17:32:52.294 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - PrivilegedAction [as: С (auth:SIMPLE)][action: org.apache.hadoop.mapreduce.Job$6@1229a2b7]
java.lang.Exception: null
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1950)
at org.apache.hadoop.mapreduce.Job.getTaskCompletionEvents(Job.java:730)
at org.apache.hadoop.mapreduce.Job.monitorAndPrintJob(Job.java:1759)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1698)
at cn.itcast.mr.dedup.ParallelFPGrowth.main(ParallelFPGrowth.java:57)
17:32:52.294 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - PrivilegedAction [as: С (auth:SIMPLE)][action: org.apache.hadoop.mapreduce.Job$1@e5cbff2]
java.lang.Exception: null
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1950)
at org.apache.hadoop.mapreduce.Job.updateStatus(Job.java:329)
at org.apache.hadoop.mapreduce.Job.isComplete(Job.java:613)
at org.apache.hadoop.mapreduce.Job.monitorAndPrintJob(Job.java:1736)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1698)
at cn.itcast.mr.dedup.ParallelFPGrowth.main(ParallelFPGrowth.java:57)

However, my cluster is running fine, the output folder at that path has already been deleted, and I am running this directly from IDEA. What is the best way to fix this?