[MapReduce] Filter Pattern

This post walks through a Udacity course exercise: filtering, from a large collection of forum posts, the posts whose body contains only a single sentence. Using Python and its csv module, the data file is parsed and the posts that meet the condition are counted.

This is an exercise from Lesson 4 of Udacity's course Intro to Hadoop and MapReduce.


The exercise filters out, from a large number of forum posts, those that contain only one sentence. The problem description and the Python code are given below.

#!/usr/bin/python
import sys
import csv

# To run this code on the actual data, please download the additional dataset.
# You can find instructions in the course materials (wiki) and in the instructor notes.
# There are some things in this data file that are different from what you saw
# in Lesson 3. The dataset is more complicated and closer to what you might
# see in the real world. It was generated by exporting data from a SQL database.
# 
# The data in at least one of the fields (the body field) can include newline
# characters, and all the fields are enclosed in double quotes. Therefore, we
# will need to process the data file in a way other than using split(","). To do this, 
# we have provided sample code for using the csv module of Python. Each 'line'
# will be a list that contains each field in sequential order.
# 
# In this exercise, we are interested in the field 'body' (which is the 5th field, 
# line[4]). The objective is to count the number of forum nodes where 'body' either 
# contains none of the three punctuation marks: period ('.'), exclamation point ('!'), 
# question mark ('?'), or else 'body' contains exactly one such punctuation mark as the 
# last character. There is no need to parse the HTML inside 'body'. Also, do not pay
# special attention to newline characters.

# Sentence-ending punctuation marks recognized by the filter
punctualSet = ['.', '!', '?']

def mapper():
    reader = csv.reader(sys.stdin, delimiter='\t')
    writer = csv.writer(sys.stdout, delimiter='\t', quotechar='"', quoting=csv.QUOTE_ALL)

    for line in reader:

        # Emit the record unchanged only if its 'body' field (line[4]) is a single sentence
        if one_sentence(line[4]):
            writer.writerow(line)
            
def one_sentence(body):
    """Return True if 'body' contains at most one of '.', '!', '?',
    and only as its last character."""
    # A single sentence-ending mark is allowed, but only at the very end.
    if body and body[-1] in punctualSet:
        body = body[:-1]
    # Anything left over means the body holds more than one sentence.
    if containsAny(body, punctualSet):
        return False
    return True

def containsAny(body, tokens):
    """Check whether 'body' contains ANY of the characters in 'tokens'."""
    return any(c in body for c in tokens)

test_text = """\"\"\t\"\"\t\"\"\t\"\"\t\"This is one sentence\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"Also one sentence!\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"Hey!\nTwo sentences!\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"One. Two! Three?\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"One Period. Two Sentences\"\t\"\"
\"\"\t\"\"\t\"\"\t\"\"\t\"Three\nlines, one sentence\n\"\t\"\"
"""

# This function allows you to test the mapper with the provided test string
def main():
    import StringIO  # Python 2 module; use io.StringIO on Python 3
    sys.stdin = StringIO.StringIO(test_text)
    mapper()
    sys.stdin = sys.__stdin__

if __name__ == "__main__":
    main()
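
In Hadoop Streaming terms this is a pure filter job: the mapper emits the matching records and nothing needs to be aggregated, but since the problem statement asks for a count, the tallying can be done in a reduce step. The snippet below is a minimal sketch of such a counting reducer (my own illustration, not the course's reference solution), assuming it receives the tab-separated, fully quoted records written by the mapper above.

#!/usr/bin/python
import sys
import csv

def reducer():
    # Every record reaching this point already passed the mapper's
    # one-sentence filter, so counting the records yields the number
    # the exercise asks for.
    reader = csv.reader(sys.stdin, delimiter='\t')
    count = 0
    for _ in reader:
        count += 1
    sys.stdout.write("%d\n" % count)

if __name__ == "__main__":
    reducer()

Run locally, the mapper's output can be piped straight into this reducer. With the provided test string, rows 1, 2 and 6 should pass the filter (their bodies contain no '.', '!' or '?' except, at most, one as the last character), so the expected count is 3. In an actual Streaming job, a single reduce task would be needed to produce one global count.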
