5. Hadoop + HBase Example

This is an example of log analysis with Hadoop and HBase. The program consists of four classes: AccessLogMapper, AccessLogReducer, LogAnalysiser, and AccessLogSampleApplication. The Mapper splits each input line into a key and a value, and the Reducer groups values by key and wraps each one in an HBase Put. The job ran successfully, and its output was stored in the c:/output directory.


Let's write a quick example just for fun. The idea is to take the contents of 1.txt, 2.txt, and 3.txt under c:/input and push them through the fabled MapReduce: map them, then reduce them. Sure enough, the part-r-00000 file under c:/output comes out sorted in dictionary order (the keys are Text, so they sort lexicographically, which is why 10, 11, ... appear before 2 in the reduce output below).

 

package com.taobao.app;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class AccessLogMapper extends Mapper<Object, Text, Text, Text> {

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // Each input line looks like "1 name1--txt1": the first token becomes the
        // output key, the second token the output value.
        String[] temp = value.toString().split(" ");
        String k = temp[0];
        String v = temp[1];
        System.out.println("mapper:  key " + k + "value " + v);
        context.write(new Text(k), new Text(v));
    }
}


package com.taobao.app;

import java.io.IOException;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;

public class AccessLogReducer extends TableReducer<Text, Text, ImmutableBytesWritable> {

    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Keys are unique in this sample, so each group carries exactly one value.
        String k = key.toString();
        String v = values.iterator().next().toString();

        // Use the key as the HBase row key and store the value under f1:qualifier.
        ImmutableBytesWritable rowKey = new ImmutableBytesWritable(Bytes.toBytes(k));
        Put put = new Put(Bytes.toBytes(k));
        put.add(Bytes.toBytes("f1"), Bytes.toBytes("qualifier"), Bytes.toBytes(v));

        System.out.println("reduce key:" + k + "  value:" + v);

        context.write(rowKey, put);
    }
}


package com.taobao.app;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;

public class LogAnalysiser extends Configured implements Tool {

    public int run(String[] args) throws Exception {
        // Use the configuration injected by ToolRunner so generic options are honored.
        Configuration conf = getConf();
        // HBase cluster location (only relevant once the job writes to HBase).
        conf.set("hbase.rootdir", "hdfs://my031068.sqa.cm4.tbsite.net:9000/hbase");
        conf.set("hbase.zookeeper.quorum", "10.232.31.68");
        conf.set("hbase.zookeeper.property.clientPort", "2222");

        Job job = new Job(conf, "Sample MR Application");
        job.setJarByClass(AccessLogSampleApplication.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(AccessLogMapper.class);
        job.setReducerClass(AccessLogReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("c:/input"));
        FileOutputFormat.setOutputPath(job, new Path("c:/output"));

        // Submit the job once and block until it finishes.
        return job.waitForCompletion(true) ? 0 : 1;
    }
}
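
One thing worth noting: as written, the driver never binds the reducer to an HBase table (no output table is configured), so the default TextOutputFormat simply writes each key and each Put's toString() into c:/output, which is exactly what the part-r-00000 listing further down shows. If you actually want the rows to land in HBase, the usual wiring for this generation of the API is TableMapReduceUtil.initTableReducerJob. Below is a minimal sketch of such a driver variant; the table name access_log is my own assumption (it is not part of the original example), and the table must already exist with column family f1 before the job runs.

package com.taobao.app;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;

// Hypothetical variant of LogAnalysiser that sends the reducer's Puts into the
// assumed HBase table "access_log" instead of a text file under c:/output.
public class LogAnalysiserToTable extends Configured implements Tool {

    public int run(String[] args) throws Exception {
        // HBaseConfiguration.create() picks up hbase-site.xml; the explicit
        // ZooKeeper settings mirror the original driver.
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "10.232.31.68");
        conf.set("hbase.zookeeper.property.clientPort", "2222");

        Job job = new Job(conf, "Sample MR Application (HBase output)");
        job.setJarByClass(LogAnalysiserToTable.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(AccessLogMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("c:/input"));

        // Configures TableOutputFormat for "access_log" and installs AccessLogReducer.
        TableMapReduceUtil.initTableReducerJob("access_log", AccessLogReducer.class, job);

        return job.waitForCompletion(true) ? 0 : 1;
    }
}

It would be launched the same way as the original, e.g. ToolRunner.run(new Configuration(), new LogAnalysiserToTable(), args).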


package com.taobao.app;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class AccessLogSampleApplication {

    public static void main(String[] args) throws Exception {
        // Run the LogAnalysiser tool and exit with its return code.
        int rc = ToolRunner.run(new Configuration(), new LogAnalysiser(), args);
        System.exit(rc);
    }
}
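
AccessLogReducer writes into column family f1, so whatever table the HBase-output variant targets has to exist with that family before the job is submitted. A minimal sketch of creating it with the admin API of that HBase generation, again assuming the hypothetical table name access_log:

package com.taobao.app;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;

// One-off helper that creates the assumed target table "access_log" with the
// column family "f1" expected by AccessLogReducer.
public class CreateAccessLogTable {

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "10.232.31.68");
        conf.set("hbase.zookeeper.property.clientPort", "2222");

        HBaseAdmin admin = new HBaseAdmin(conf);
        if (!admin.tableExists("access_log")) {
            HTableDescriptor desc = new HTableDescriptor("access_log");
            desc.addFamily(new HColumnDescriptor("f1"));
            admin.createTable(desc);
            System.out.println("created table access_log");
        } else {
            System.out.println("table access_log already exists");
        }
    }
}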

Contents of 1.txt:
1 name1--txt1-www.javabloger.com
2 name2--txt1
3 name3--txt1
4 name4--txt1
5 name5--txt1

Contents of 2.txt:
6 name6--txt2-www.javabloger.com
7 name7--txt2
8 name8--txt2
9 name9--txt2
10 name10--txt2
 
Contents of 3.txt:
11 name11--txt3-www.javabloger.com
12 name12--txt3
13 name13--txt3
14 name14--txt3
15 name15--txt3


Below is what was printed to the console:

11/07/30 22:07:31 INFO jvm.JvmMetrics: Initializing JVM Metrics with processName=JobTracker, sessionId=
11/07/30 22:07:31 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
11/07/30 22:07:32 WARN mapred.JobClient: No job jar file set.  User classes may not be found. See JobConf(Class) or JobConf#setJar(String).
11/07/30 22:07:32 INFO input.FileInputFormat: Total input paths to process : 3
11/07/30 22:07:32 INFO mapred.JobClient: Running job: job_local_0001
11/07/30 22:07:32 INFO input.FileInputFormat: Total input paths to process : 3
11/07/30 22:07:32 INFO mapred.MapTask: io.sort.mb = 100
11/07/30 22:07:32 INFO mapred.MapTask: data buffer = 79691776/99614720
11/07/30 22:07:32 INFO mapred.MapTask: record buffer = 262144/327680
mapper:  key 1value name1--txt1-www.javabloger.com
11/07/30 22:07:32 INFO mapred.MapTask: Starting flush of map output
mapper:  key 2value name2--txt1
mapper:  key 3value name3--txt1
mapper:  key 4value name4--txt1
mapper:  key 5value name5--txt1
11/07/30 22:07:32 INFO mapred.MapTask: Finished spill 0
11/07/30 22:07:32 INFO mapred.TaskRunner: Task:attempt_local_0001_m_000000_0 is done. And is in the process of commiting
11/07/30 22:07:32 INFO mapred.LocalJobRunner:
11/07/30 22:07:32 INFO mapred.TaskRunner: Task 'attempt_local_0001_m_000000_0' done.
11/07/30 22:07:32 INFO mapred.MapTask: io.sort.mb = 100
11/07/30 22:07:32 INFO mapred.MapTask: data buffer = 79691776/99614720
11/07/30 22:07:32 INFO mapred.MapTask: record buffer = 262144/327680
mapper:  key 6value name6--txt2-www.javabloger.com
mapper:  key 7value name7--txt2
mapper:  key 8value name8--txt2
mapper:  key 9value name9--txt2
mapper:  key 10value name10--txt2
11/07/30 22:07:32 INFO mapred.MapTask: Starting flush of map output
11/07/30 22:07:32 INFO mapred.MapTask: Finished spill 0
11/07/30 22:07:32 INFO mapred.TaskRunner: Task:attempt_local_0001_m_000001_0 is done. And is in the process of commiting
11/07/30 22:07:32 INFO mapred.LocalJobRunner:
11/07/30 22:07:32 INFO mapred.TaskRunner: Task 'attempt_local_0001_m_000001_0' done.
11/07/30 22:07:32 INFO mapred.MapTask: io.sort.mb = 100
11/07/30 22:07:32 INFO mapred.MapTask: data buffer = 79691776/99614720
11/07/30 22:07:32 INFO mapred.MapTask: record buffer = 262144/327680
mapper:  key 11value name11--txt3-www.javabloger.com
mapper:  key 12value name12--txt3
mapper:  key 13value name13--txt3
mapper:  key 14value name14--txt3
mapper:  key 15value name15--txt3
11/07/30 22:07:32 INFO mapred.MapTask: Starting flush of map output
11/07/30 22:07:32 INFO mapred.MapTask: Finished spill 0
11/07/30 22:07:32 INFO mapred.TaskRunner: Task:attempt_local_0001_m_000002_0 is done. And is in the process of commiting
11/07/30 22:07:32 INFO mapred.LocalJobRunner:
11/07/30 22:07:32 INFO mapred.TaskRunner: Task 'attempt_local_0001_m_000002_0' done.
11/07/30 22:07:32 INFO mapred.LocalJobRunner:
11/07/30 22:07:32 INFO mapred.Merger: Merging 3 sorted segments
11/07/30 22:07:32 INFO mapred.Merger: Down to the last merge-pass, with 3 segments left of total size: 315 bytes
11/07/30 22:07:32 INFO mapred.LocalJobRunner:
reduce key:1  value:name1--txt1-www.javabloger.com
reduce key:10  value:name10--txt2
reduce key:11  value:name11--txt3-www.javabloger.com
reduce key:12  value:name12--txt3
reduce key:13  value:name13--txt3
reduce key:14  value:name14--txt3
reduce key:15  value:name15--txt3
reduce key:2  value:name2--txt1
reduce key:3  value:name3--txt1
reduce key:4  value:name4--txt1
reduce key:5  value:name5--txt1
reduce key:6  value:name6--txt2-www.javabloger.com
reduce key:7  value:name7--txt2
reduce key:8  value:name8--txt2
reduce key:9  value:name9--txt2

The output goes to the c:/output directory; the contents of part-r-00000 are shown below. The leading numbers on each line are the ImmutableBytesWritable row key rendered as space-separated hex bytes (31 is '1', 31 30 is '10'), followed by the Put's toString():
31 row=1, families={(family=f1, keyvalues=(1/f1:qualifier/9223372036854775807/Put/vlen=30)}
31 30 row=10, families={(family=f1, keyvalues=(10/f1:qualifier/9223372036854775807/Put/vlen=12)}
31 31 row=11, families={(family=f1, keyvalues=(11/f1:qualifier/9223372036854775807/Put/vlen=31)}
31 32 row=12, families={(family=f1, keyvalues=(12/f1:qualifier/9223372036854775807/Put/vlen=12)}
31 33 row=13, families={(family=f1, keyvalues=(13/f1:qualifier/9223372036854775807/Put/vlen=12)}
31 34 row=14, families={(family=f1, keyvalues=(14/f1:qualifier/9223372036854775807/Put/vlen=12)}
31 35 row=15, families={(family=f1, keyvalues=(15/f1:qualifier/9223372036854775807/Put/vlen=12)}
32 row=2, families={(family=f1, keyvalues=(2/f1:qualifier/9223372036854775807/Put/vlen=11)}
33 row=3, families={(family=f1, keyvalues=(3/f1:qualifier/9223372036854775807/Put/vlen=11)}
34 row=4, families={(family=f1, keyvalues=(4/f1:qualifier/9223372036854775807/Put/vlen=11)}
35 row=5, families={(family=f1, keyvalues=(5/f1:qualifier/9223372036854775807/Put/vlen=11)}
36 row=6, families={(family=f1, keyvalues=(6/f1:qualifier/9223372036854775807/Put/vlen=30)}
37 row=7, families={(family=f1, keyvalues=(7/f1:qualifier/9223372036854775807/Put/vlen=11)}
38 row=8, families={(family=f1, keyvalues=(8/f1:qualifier/9223372036854775807/Put/vlen=11)}
39 row=9, families={(family=f1, keyvalues=(9/f1:qualifier/9223372036854775807/Put/vlen=11)}
11/07/30 22:07:32 INFO mapred.TaskRunner: Task:attempt_local_0001_r_000000_0 is done. And is in the process of commiting
11/07/30 22:07:32 INFO mapred.LocalJobRunner:
11/07/30 22:07:32 INFO mapred.TaskRunner: Task attempt_local_0001_r_000000_0 is allowed to commit now
11/07/30 22:07:33 INFO output.FileOutputCommitter: Saved output of task 'attempt_local_0001_r_000000_0' to c:/output
11/07/30 22:07:33 INFO mapred.LocalJobRunner: reduce > reduce
11/07/30 22:07:33 INFO mapred.TaskRunner: Task 'attempt_local_0001_r_000000_0' done.
11/07/30 22:07:33 INFO mapred.JobClient:  map 100% reduce 100%
11/07/30 22:07:33 INFO mapred.JobClient: Job complete: job_local_0001
11/07/30 22:07:33 INFO mapred.JobClient: Counters: 12
11/07/30 22:07:33 INFO mapred.JobClient:   FileSystemCounters
11/07/30 22:07:33 INFO mapred.JobClient:     FILE_BYTES_READ=55525
11/07/30 22:07:33 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=113650
11/07/30 22:07:33 INFO mapred.JobClient:   Map-Reduce Framework
11/07/30 22:07:33 INFO mapred.JobClient:     Reduce input groups=15
11/07/30 22:07:33 INFO mapred.JobClient:     Combine output records=0
11/07/30 22:07:33 INFO mapred.JobClient:     Map input records=15
11/07/30 22:07:33 INFO mapred.JobClient:     Reduce shuffle bytes=0
11/07/30 22:07:33 INFO mapred.JobClient:     Reduce output records=15
11/07/30 22:07:33 INFO mapred.JobClient:     Spilled Records=30
11/07/30 22:07:33 INFO mapred.JobClient:     Map output bytes=279
11/07/30 22:07:33 INFO mapred.JobClient:     Combine input records=0
11/07/30 22:07:33 INFO mapred.JobClient:     Map output records=15
11/07/30 22:07:33 INFO mapred.JobClient:     Reduce input records=15
