Hadoop MultipleInputs Example (repost)

This article walks through a Hadoop MapReduce program that reads two input files of different formats, tags each record with its source via a custom data type, and joins the records in the reduce phase, producing one output line per country with its name, ID, and capital.
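For orientation, here is a small illustrative data set that matches the mappers below (the file contents and values are hypothetical samples, not data from the original post). File_1.txt holds tab-separated "ID, country" lines, File_2.txt holds tab-separated "country, capital" lines, and the reducer joins them on the country name:

File_1.txt (hypothetical):
1	China
2	France

File_2.txt (hypothetical):
China	BeiJing
France	Paris

Expected output (TextOutputFormat writes key, then the formatted record, tab-separated):
China	ID=1	name=China	capital=BeiJing
France	ID=2	name=France	capital=Paris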


The MR program:

package org.forward.example.hadoop.multipleinputs;


import java.io.IOException;
import java.util.Iterator;


import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleInputs;


public class Example_1
{
    /**
     * Handles File_1.txt. Input: line offset, line content<br/>
     * Output: key = country name, value = ID
     *
     * @author Jim
     */
    public static class MapA extends MapReduceBase implements Mapper<LongWritable, Text, Text, Capital_or_ID>
    {
        @Override
        public void map(LongWritable lineN, Text content, OutputCollector<Text, Capital_or_ID> collect, Reporter rp)
                throws IOException
        {
            // Each line of File_1.txt is "ID<TAB>country"; emit the country as key and the ID tagged with "file_1".
            String part[] = content.toString().split("\\t");
            if (part.length == 2)
            {
                Capital_or_ID coi = new Capital_or_ID(part[0], "file_1");
                collect.collect(new Text(part[1]), coi);
            }
            System.out.println("in MapA: content=" + content);
            for (String s : part)
            {
                System.out.println("part[idx]=" + s);
            }
        }
    }


    /**
     * Handles File_2.txt. Input: line offset, line content<br/>
     * Output: key = country name, value = capital name
     *
     * @author Jim
     */
    public static class MapB extends MapReduceBase implements Mapper<LongWritable, Text, Text, Capital_or_ID>
    {
        @Override
        public void map(LongWritable lineN, Text content, OutputCollector<Text, Capital_or_ID> collect, Reporter rp)
                throws IOException
        {
            // Each line of File_2.txt is "country<TAB>capital"; emit the country as key and the capital tagged with "file_2".
            String part[] = content.toString().split("\\t");
            if (part.length == 2)
            {
                Capital_or_ID coi = new Capital_or_ID(part[1], "file_2");
                collect.collect(new Text(part[0]), coi);
            }
            System.out.println("in MapB: content=" + content);
            for (String s : part)
            {
                System.out.println("part[idx]=" + s);
            }
        }
    }


    /**
     * Reduce produces the final result, formatting each country's ID and capital as
     * "ID=%s\tname=%s\tcapital=%s", e.g. "ID=1 name=China capital=BeiJing".
     *
     * @author Jim
     */
    public static class Reduce extends MapReduceBase implements Reducer<Text, Capital_or_ID, Text, Text>
    {
        @Override
        public void reduce(Text countryName, Iterator<Capital_or_ID> values, OutputCollector<Text, Text> collect,
                Reporter rp) throws IOException
        {
            // Values for the same country arrive from both mappers; the tag tells which file each one came from.
            String capitalName = null, ID = null;
            while (values.hasNext())
            {
                Capital_or_ID coi = values.next();
                if (coi.getTag().equals("file_1"))
                {
                    ID = coi.getValue();
                } else if (coi.getTag().equals("file_2"))
                {
                    capitalName = coi.getValue();
                }
            }

            String result = String.format("ID=%s\tname=%s\tcapital=%s", ID, countryName, capitalName);

            collect.collect(countryName, new Text(result));
        }
    }


    public static void main(String args[]) throws IOException
    {
        // args[0]: file1 for MapA
        String file_1 = args[0];
        // args[1]: file2 for MapB
        String file_2 = args[1];
        // args[2]: output path
        String outPath = args[2];

        JobConf conf = new JobConf(Example_1.class);
        conf.setJobName("example-MultipleInputs");

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(Capital_or_ID.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        conf.setReducerClass(Reduce.class);

        conf.setOutputFormat(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(conf, new Path(outPath));

        // Each input path gets its own InputFormat and Mapper, so no global mapper or input format is set.
        MultipleInputs.addInputPath(conf, new Path(file_1), TextInputFormat.class, MapA.class);
        MultipleInputs.addInputPath(conf, new Path(file_2), TextInputFormat.class, MapB.class);

        JobClient.runJob(conf);
    }
}
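Assuming the two classes are packaged into a jar (the jar name and HDFS paths below are placeholders, not from the original post), the job would be submitted roughly like this:

hadoop jar multipleinputs-example.jar org.forward.example.hadoop.multipleinputs.Example_1 /input/File_1.txt /input/File_2.txt /output/multipleinputs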


The helper class used above:

package org.forward.example.hadoop.multipleinputs;


import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;


import org.apache.hadoop.io.Writable;
/**
 * @author Jim
 * Custom data type. Its purpose is to tag data when several input files are handled by several
 * Mappers, so that the Reducer can tell which file each value came from.
 */
public class Capital_or_ID implements Writable
{
    /** Values from the same source should carry the same tag. */
    private String tag = null;

    private String value = null;

    /** No-arg constructor required by Hadoop for deserialization. */
    public Capital_or_ID()
    {
    }

    public Capital_or_ID(String value, String tag)
    {
        this.value = value;
        this.tag = tag;
    }

    public String getTag()
    {
        return tag;
    }

    public void setTag(String tag)
    {
        this.tag = tag;
    }

    public String getValue()
    {
        return value;
    }

    @Override
    public void readFields(DataInput in) throws IOException
    {
        // Fields must be read in the same order they are written in write().
        tag = in.readUTF();
        value = in.readUTF();
    }

    @Override
    public void write(DataOutput out) throws IOException
    {
        out.writeUTF(tag);
        out.writeUTF(value);
    }
}
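As a quick sanity check of the serialization logic, the Writable can be round-tripped through a byte stream outside of Hadoop. This small sketch is not part of the original post; the class name Capital_or_ID_RoundTrip and the sample values are hypothetical:

package org.forward.example.hadoop.multipleinputs;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

/** Illustrative only: writes a Capital_or_ID to a byte buffer and reads it back. */
public class Capital_or_ID_RoundTrip
{
    public static void main(String[] args) throws IOException
    {
        Capital_or_ID original = new Capital_or_ID("BeiJing", "file_2");

        // Serialize the fields the same way Hadoop does between map and reduce.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize into a fresh instance created via the no-arg constructor.
        Capital_or_ID copy = new Capital_or_ID();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println("tag=" + copy.getTag() + ", value=" + copy.getValue());
        // Expected: tag=file_2, value=BeiJing
    }
}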

