Data Integrity
When a client reads data from a datanode, it verifies checksums.
Each datanode also runs a DataBlockScanner in a background thread, periodically verifying all the blocks stored on that datanode.
Because HDFS stores replicas of each block, a corrupted block can be repaired by copying one of its healthy replicas.
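A minimal sketch (not from the original notes) showing that client-side checksum verification can also be switched off with FileSystem.setVerifyChecksum(), for example when trying to salvage data from a corrupt file; the HDFS URI reuses the host from the notes and the path is a placeholder.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(URI.create("hdfs://hadoop:9000/"), conf);
fs.setVerifyChecksum(false);  // skip client-side checksum verification on read
InputStream in = fs.open(new Path("/input/possiblyCorruptFile.txt"));  // hypothetical path
IOUtils.copyBytes(in, System.out, 4096, false);
IOUtils.closeStream(in);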
Compression
Compression has two major benefits:
1. It reduces the disk space needed to store files.
2. It speeds up data transfer across the network and to or from disk.
Compression is most commonly done with gzip.
bzip2 compresses more slowly than gzip, but it supports splitting.
A codec is an implementation of a compression-decompression algorithm.
1. Compressing and decompressing data with CompressionCodec
// Compress a local file and write it to HDFS with a gzip extension
String uri = "hdfs://hadoop:9000/input/createFileGz1.gz";
String localUri = "F:/NL/hadoop/input/createFile1.txt";
String codecClassname = "org.apache.hadoop.io.compress.GzipCodec";
Class<?> codecClass = Class.forName(codecClassname);
Configuration conf = new Configuration();
// Instantiate the codec reflectively so it picks up the configuration
CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
FileSystem fs = FileSystem.get(URI.create(uri), conf);
// Wrap the HDFS output stream in a compressing stream
CompressionOutputStream out = codec.createOutputStream(fs.create(new Path(uri)));
InputStream in = new BufferedInputStream(new FileInputStream(localUri));
IOUtils.copyBytes(in, out, 4096, false);
out.finish();   // flush any remaining compressed data
IOUtils.closeStream(in);
IOUtils.closeStream(out);
2. Inferring the CompressionCodec with CompressionCodecFactory and decompressing a file
package org.apache.hadoop.examples.io;
// cc FileDecompressor A program to decompress a compressed file using a codec inferred from the file's extension
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
// vv FileDecompressor
public class FileDecompressor {
public static void main(String[] args) throws Exception {
// String uri = args[0];
String uri = "hdfs://hadoop:9000/input/createFileGz1.gz";
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(URI.create(uri), conf);
Path inputPath = new Path(uri);
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
CompressionCodec codec = factory.getCodec(inputPath);
if (codec == null) {
System.err.println("No codec found for " + uri);
System.exit(1);
}
String outputUri =
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
InputStream in = null;
OutputStream out = null;
try {
in = codec.createInputStream(fs.open(inputPath));
out = fs.create(new Path(outputUri));
IOUtils.copyBytes(in, out, conf);
} finally {
IOUtils.closeStream(in);
IOUtils.closeStream(out);
}
}
}
// ^^ FileDecompressor
CodecPool
If you are using a native library and you need to do a lot of compression and decompression in your application, consider using CodecPool,
which lets you reuse compressors and decompressors, amortizing the cost of creating these objects.
public class PooledStreamCompressor {
public static void main(String[] args) throws Exception {
String codecClassname = args[0];
Class<?> codecClass = Class.forName(codecClassname);
Configuration conf = new Configuration();
CompressionCodec codec = (CompressionCodec)
ReflectionUtils.newInstance(codecClass, conf);
/*[*/Compressor compressor = null;
try {
compressor = CodecPool.getCompressor(codec);/*]*/
CompressionOutputStream out =
codec.createOutputStream(System.out, /*[*/compressor/*]*/);
IOUtils.copyBytes(System.in, out, 4096, false);
out.finish();
/*[*/} finally {
CodecPool.returnCompressor(compressor);
}/*]*/
}
}
Compression and Input Splits
// Compress the job output with gzip
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
Compression strategies, from most to least effective:
Use a container file format, such as sequence files, Avro datafiles, or Parquet.
Use bzip2, which supports splitting directly (though it is relatively slow), or a format such as LZO that can be made splittable by building an index.
For large files, do not use a compression format that cannot be split across the whole file: you lose data locality and the MapReduce job becomes inefficient (a splittability check is sketched below).
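As a rough illustration of the splittability point above, this sketch (not from the original notes, assuming the usual org.apache.hadoop.io.compress imports) asks CompressionCodecFactory for the codec matching a file extension and checks whether it implements SplittableCompressionCodec; the file names are placeholders.
Configuration conf = new Configuration();
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
for (String name : new String[] { "big.gz", "big.bz2" }) {   // hypothetical file names
    CompressionCodec codec = factory.getCodec(new Path(name));
    boolean splittable = codec instanceof SplittableCompressionCodec;   // true for bzip2, false for gzip
    System.out.printf("%s -> %s, splittable: %b%n",
        name, codec.getClass().getSimpleName(), splittable);
}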
If the MapReduce driver uses the Tool interface, these compression properties can be passed to the program from the command line.
Compressing map output: because map output is written to disk and transferred across the network to the reducer nodes, using a fast
compression format such as LZO, LZ4, or Snappy can improve performance simply because there is less data to transfer.
Configuration conf = new Configuration();
conf.setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
conf.setClass(Job.MAP_OUTPUT_COMPRESS_CODEC, GzipCodec.class, CompressionCodec.class);
Serialization
Serialization is the process of turning structured objects into a byte stream for transmission over a network or for writing to persistent storage.
Deserialization is the reverse process of turning a byte stream back into structured objects.
Serialization is used in two main areas of distributed data processing: interprocess communication and permanent storage.
Communication between nodes is usually implemented with remote procedure calls (RPC): a protocol message is serialized into a binary stream and sent to the remote node, which then deserializes the binary stream back into the original message.
Hadoop uses its own serialization format, Writable.
The Writable interface defines two methods:
public abstract interface Writable {
public abstract void write(DataOutput paramDataOutput) throws IOException;
public abstract void readFields(DataInput paramDataInput) throws IOException;
}
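A minimal round-trip sketch (not from the original notes) showing how write() and readFields() are used in practice; the helper names serialize and deserialize are illustrative only.
import java.io.*;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;

public class WritableRoundTrip {
    // Serialize a Writable into a byte array
    public static byte[] serialize(Writable writable) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(out);
        writable.write(dataOut);
        dataOut.close();
        return out.toByteArray();
    }

    // Populate a Writable from a byte array
    public static void deserialize(Writable writable, byte[] bytes) throws IOException {
        DataInputStream dataIn = new DataInputStream(new ByteArrayInputStream(bytes));
        writable.readFields(dataIn);
        dataIn.close();
    }

    public static void main(String[] args) throws IOException {
        IntWritable src = new IntWritable(163);
        byte[] bytes = serialize(src);                      // 4 bytes for an IntWritable
        System.out.println("serialized length: " + bytes.length);

        IntWritable dst = new IntWritable();
        deserialize(dst, bytes);
        System.out.println("round-tripped value: " + dst.get());   // 163
    }
}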
A complete Writable implementation, LongWritable:
public class LongWritable implements WritableComparable<LongWritable> {
private long value;
public LongWritable() {
}
public LongWritable(long value) {
set(value);
}
public void set(long value) {
this.value = value;
}
public long get() {
return this.value;
}
public void readFields(DataInput in) throws IOException {
this.value = in.readLong();
}
public void write(DataOutput out) throws IOException {
out.writeLong(this.value);
}
public boolean equals(Object o) {
if (!(o instanceof LongWritable))
return false;
LongWritable other = (LongWritable) o;
return (this.value == other.value);
}
public int hashCode() {
return (int) this.value;
}
public int compareTo(LongWritable o) {
long thisValue = this.value;
long thatValue = o.value;
return ((thisValue == thatValue) ? 0 : (thisValue < thatValue) ? -1 : 1);
}
public String toString() {
return Long.toString(this.value);
}
static {
WritableComparator.define(LongWritable.class, new Comparator());
}
public static class DecreasingComparator extends LongWritable.Comparator {
public int compare(WritableComparable a, WritableComparable b) {
return super.compare(b, a);
}
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
return super.compare(b2, s2, l2, b1, s1, l1);
}
}
public static class Comparator extends WritableComparator {
public Comparator() {
super(LongWritable.class);
}
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
long thisValue = readLong(b1, s1);
long thatValue = readLong(b2, s2);
return ((thisValue == thatValue) ? 0 : (thisValue < thatValue) ? -1 : 1);
}
}
}
public abstract interface WritableComparable<T> extends Writable, Comparable<T> {
}
public interface Comparable<T> {
int compareTo(T arg0);
}
Hadoop also provides RawComparator, an optimization that extends Java's Comparator and allows records to be compared directly from their serialized byte representations:
public abstract interface RawComparator<T> extends Comparator<T> {
public abstract int compare(byte[] paramArrayOfByte1, int paramInt1, int paramInt2, byte[] paramArrayOfByte2,
int paramInt3, int paramInt4);
}
RawComparator<IntWritable> comparator = WritableComparator.get(IntWritable.class);
IntWritable w1 = new IntWritable(163);
IntWritable w2 = new IntWritable(63);
comparator.compare(w1, w2);   // > 0, since 163 > 63
Special basic types:
VIntWritable and VLongWritable use variable-length encodings, which save space because a value does not always need the full 4 or 8 bytes.
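A small sketch (not from the original notes) to see the variable-length encoding in action: it serializes a VLongWritable and prints how many bytes it took.
import java.io.*;
import org.apache.hadoop.io.VLongWritable;

public class VLongSizeDemo {
    // Number of bytes a VLongWritable needs for a given value
    static int serializedSize(long v) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(out);
        new VLongWritable(v).write(dataOut);
        dataOut.close();
        return out.size();
    }

    public static void main(String[] args) throws IOException {
        System.out.println(serializedSize(127L));            // 1 byte
        System.out.println(serializedSize(1024L));           // 3 bytes
        System.out.println(serializedSize(Long.MAX_VALUE));  // 9 bytes
    }
}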
Text: a Writable for UTF-8 sequences, generally regarded as the equivalent of String; it can hold up to 2 GB.
Text's charAt() returns an int representing the Unicode code point at that position,
whereas String's charAt() returns a char.
Text's find() is the analogue of String's indexOf().
Text does not have as rich an API as String, so it is usually converted to a String for manipulation: new Text("AAA").toString()
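A short sketch (not from the original notes) contrasting Text and String on the points above; the printed values follow the Hadoop Text API for ASCII input.
Text t = new Text("hadoop");
String s = "hadoop";
System.out.println(t.getLength());     // 6 (bytes in the UTF-8 encoding)
System.out.println(t.charAt(2));       // 100, the int code point for 'd'
System.out.println(s.charAt(2));       // 'd', a char
System.out.println(t.find("do"));      // 2, like String.indexOf()
System.out.println(s.indexOf("do"));   // 2
System.out.println(t.toString());      // convert to String for the richer API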
BytesWritable: mutable; its contents can be changed with set().
NullWritable: zero-length; it serves as a placeholder and stores a constant empty value efficiently.
ObjectWritable: a general-purpose wrapper.
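A tiny sketch (not from the original notes) illustrating the mutability of BytesWritable and the NullWritable singleton.
BytesWritable b = new BytesWritable(new byte[] { 3, 5 });
System.out.println(b.getLength());      // 2
b.set(new byte[] { 1, 2, 3 }, 0, 3);    // mutate the contents in place
System.out.println(b.getLength());      // 3
NullWritable nw = NullWritable.get();   // the single shared instance; serializes to zero bytes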
Writable collection classes:
ArrayWritable
TwoDArrayWritable: a two-dimensional array
All elements must be instances of the same Writable class, for example:
ArrayWritable writable = new ArrayWritable(Text.class);
writable.set(new Text[] { new Text("cat"), new Text("dog") });
TextArrayWritable dest = new TextArrayWritable();
WritableUtils.cloneInto(dest, writable);
assertThat(dest.get().length, is(2));
// TODO: fix cast, also use single assert
assertThat((Text) dest.get()[0], is(new Text("cat")));
assertThat((Text) dest.get()[1], is(new Text("dog")));
Text[] copy = (Text[]) dest.toArray();
assertThat(copy[0], is(new Text("cat")));
assertThat(copy[1], is(new Text("dog")));
Implementing a custom Writable, TextPair:
public class TextPair implements WritableComparable<TextPair> {
private Text first;
private Text second;
public TextPair() {
set(new Text(), new Text());
}
public TextPair(String first, String second) {
set(new Text(first), new Text(second));
}
public TextPair(Text first, Text second) {
set(first, second);
}
public void set(Text first, Text second) {
this.first = first;
this.second = second;
}
public Text getFirst() {
return first;
}
public Text getSecond() {
return second;
}
@Override
public void write(DataOutput out) throws IOException {
first.write(out);
second.write(out);
}
@Override
public void readFields(DataInput in) throws IOException {
first.readFields(in);
second.readFields(in);
}
@Override
public int hashCode() {
return first.hashCode() * 163 + second.hashCode();
}
@Override
public boolean equals(Object o) {
if (o instanceof TextPair) {
TextPair tp = (TextPair) o;
return first.equals(tp.first) && second.equals(tp.second);
}
return false;
}
@Override
public String toString() {
return first + "\t" + second;
}
@Override
public int compareTo(TextPair tp) {
int cmp = first.compareTo(tp.first);
if (cmp != 0) {
return cmp;
}
return second.compareTo(tp.second);
}
// ^^ TextPair
// vv TextPairComparator
public static class Comparator extends WritableComparator {
private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();
public Comparator() {
super(TextPair.class);
}
@Override
public int compare(byte[] b1, int s1, int l1,
byte[] b2, int s2, int l2) {
try {
int firstL1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
int firstL2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
int cmp = TEXT_COMPARATOR.compare(b1, s1, firstL1, b2, s2, firstL2);
if (cmp != 0) {
return cmp;
}
return TEXT_COMPARATOR.compare(b1, s1 + firstL1, l1 - firstL1,
b2, s2 + firstL2, l2 - firstL2);
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
}
static {
WritableComparator.define(TextPair.class, new Comparator());
}
// ^^ TextPairComparator
// vv TextPairFirstComparator
public static class FirstComparator extends WritableComparator {
private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();
public FirstComparator() {
super(TextPair.class);
}
@Override
public int compare(byte[] b1, int s1, int l1,
byte[] b2, int s2, int l2) {
try {
int firstL1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
int firstL2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
return TEXT_COMPARATOR.compare(b1, s1, firstL1, b2, s2, firstL2);
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
if (a instanceof TextPair && b instanceof TextPair) {
return ((TextPair) a).first.compareTo(((TextPair) b).first);
}
return super.compare(a, b);
}
}
// ^^ TextPairFirstComparator
// vv TextPair
}
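A brief usage sketch (not from the original notes): because the static block registers TextPair.Comparator, WritableComparator.get() returns the raw (byte-level) comparator, which can be used just like the IntWritable comparator shown earlier.
TextPair p1 = new TextPair("a", "b");
TextPair p2 = new TextPair("a", "c");
RawComparator<TextPair> comparator = WritableComparator.get(TextPair.class);
System.out.println(comparator.compare(p1, p2) < 0);   // true: first fields tie, "b" < "c"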
File-Based Data Structures
For MapReduce-based processing, putting each binary large object (blob) in its own file does not scale, so Hadoop provides a number of higher-level containers for these situations.
SequenceFile
Consider a logfile in which each line is a log record. Plain text is not suitable once the records involve binary data types; SequenceFile, a persistent data structure for binary key-value pairs, fits this case.
Writing a SequenceFile
public class SequenceFileWriteDemo {
private static final String[] DATA = {
"One, two, buckle my shoe",
"Three, four, shut the door",
"Five, six, pick up sticks",
"Seven, eight, lay them straight",
"Nine, ten, a big fat hen"
};
public static void main(String[] args) throws IOException {
String uri = args[0];
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(URI.create(uri), conf);
Path path = new Path(uri);
IntWritable key = new IntWritable();
Text value = new Text();
SequenceFile.Writer writer = null;
try {
writer = SequenceFile.createWriter(fs, conf, path,
key.getClass(), value.getClass());
for (int i = 0; i < 100; i++) {
key.set(100 - i);
value.set(DATA[i % DATA.length]);
System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key, value);
writer.append(key, value);
}
} finally {
IOUtils.closeStream(writer);
}
}
}
Reading a SequenceFile
public class SequenceFileReadDemo {
public static void main(String[] args) throws IOException {
String uri = args[0];
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(URI.create(uri), conf);
Path path = new Path(uri);
SequenceFile.Reader reader = null;
try {
reader = new SequenceFile.Reader(fs, path, conf);
Writable key = (Writable)
ReflectionUtils.newInstance(reader.getKeyClass(), conf);
Writable value = (Writable)
ReflectionUtils.newInstance(reader.getValueClass(), conf);
long position = reader.getPosition();
while (reader.next(key, value)) {
String syncSeen = reader.syncSeen() ? "*" : "";
System.out.printf("[%s%s]\t%s\t%s\n", position, syncSeen, key, value);
position = reader.getPosition(); // beginning of next record
}
} finally {
IOUtils.closeStream(reader);
}
}
}
There are two ways to position the reader at a given point in a sequence file:
1. Call seek(); if the position is not at a record boundary, the subsequent read fails with an error.
2. Call sync(position), which moves the reader to the next sync point after position.
public class SequenceFileSeekAndSyncTest {
private static final String SF_URI = "test.numbers.seq";
private FileSystem fs;
private SequenceFile.Reader reader;
private Writable key;
private Writable value;
@Before
public void setUp() throws IOException {
SequenceFileWriteDemo.main(new String[] { SF_URI });
Configuration conf = new Configuration();
fs = FileSystem.get(URI.create(SF_URI), conf);
Path path = new Path(SF_URI);
reader = new SequenceFile.Reader(fs, path, conf);
key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
}
@After
public void tearDown() throws IOException {
fs.delete(new Path(SF_URI), true);
}
@Test
public void seekToRecordBoundary() throws IOException {
// vv SequenceFileSeekAndSyncTest
reader.seek(359);
assertThat(reader.next(key, value), is(true));
assertThat(((IntWritable) key).get(), is(95));
// ^^ SequenceFileSeekAndSyncTest
}
@Test(expected=IOException.class)
public void seekToNonRecordBoundary() throws IOException {
// vv SequenceFileSeekAndSyncTest-SeekNonRecordBoundary
reader.seek(360);
reader.next(key, value); // fails with IOException
// ^^ SequenceFileSeekAndSyncTest-SeekNonRecordBoundary
}
@Test
public void syncFromNonRecordBoundary() throws IOException {
// vv SequenceFileSeekAndSyncTest-SyncNonRecordBoundary
reader.sync(360);
assertThat(reader.getPosition(), is(2021L));
assertThat(reader.next(key, value), is(true));
assertThat(((IntWritable) key).get(), is(59));
// ^^ SequenceFileSeekAndSyncTest-SyncNonRecordBoundary
}
@Test
public void syncAfterLastSyncPoint() throws IOException {
reader.sync(4557);
assertThat(reader.getPosition(), is(4788L));
assertThat(reader.next(key, value), is(false));
}
}
Sorting and merging SequenceFiles
MapReduce is the most effective way to sort or merge one or more sequence files. It is inherently parallel, and you can specify how many reducers to use;
that number determines the number of output partitions.
hadoop jar $HADOOP_INSTALL/hadoop-*-examples.jar sort -r 1 \
-inFormat org.apache.hadoop.mapred.SequenceFileInputFormat \
-outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \
-outKey org.apache.hadoop.io.IntWritable \
-outValue org.apache.hadoop.io.Text \
numbers.seq sorted
An alternative is to use the sort() and merge() methods on SequenceFile.Sorter. They are lower-level than MapReduce and require you to partition the data manually, so MapReduce remains the best choice for sorting and merging sequence files.
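A sketch of that lower-level route (not from the original notes), assuming numbers.seq was written by SequenceFileWriteDemo on the default filesystem:
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
SequenceFile.Sorter sorter = new SequenceFile.Sorter(fs, IntWritable.class, Text.class, conf);
sorter.sort(new Path[] { new Path("numbers.seq") },   // input file(s)
            new Path("sorted.seq"),                   // single sorted output file
            false);                                   // do not delete the inputs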
The SequenceFile format
A sequence file consists of a header followed by one or more records. The first three bytes of the file are SEQ (the magic number),
followed by one byte holding the version number.
The internal format of the records depends on whether compression is enabled; block compression is recommended.
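To see the header described above, here is a small sketch (not from the original notes) that reads the first four bytes of a locally stored sequence file; the filename is a placeholder.
import java.io.DataInputStream;
import java.io.FileInputStream;

public class SeqHeaderPeek {
    public static void main(String[] args) throws Exception {
        try (DataInputStream in = new DataInputStream(new FileInputStream("numbers.seq"))) {
            byte[] magic = new byte[3];
            in.readFully(magic);           // the three bytes 'S', 'E', 'Q'
            int version = in.readByte();   // one byte holding the version number
            System.out.printf("magic=%s version=%d%n", new String(magic, "US-ASCII"), version);
        }
    }
}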
MapFile
A MapFile is a sorted SequenceFile with an index that allows lookups by key. The index is itself a SequenceFile, and because the
index can be loaded into memory, lookups are fast.
MapFile provides a reading and writing interface very similar to SequenceFile's. When writing with MapFile.Writer, map entries
must be added in order, otherwise an exception is thrown.
public class MapFileWriteDemo {
private static final String[] DATA = {
"One, two, buckle my shoe",
"Three, four, shut the door",
"Five, six, pick up sticks",
"Seven, eight, lay them straight",
"Nine, ten, a big fat hen"
};
public static void main(String[] args) throws IOException {
String uri = args[0];
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(URI.create(uri), conf);
IntWritable key = new IntWritable();
Text value = new Text();
MapFile.Writer writer = null;
try {
writer = new MapFile.Writer(conf, fs, uri,
key.getClass(), value.getClass());
for (int i = 0; i < 1024; i++) {
key.set(i + 1);
value.set(DATA[i % DATA.length]);
writer.append(key, value);
}
} finally {
IOUtils.closeStream(writer);
}
}
}
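A companion reading sketch (not from the original notes), assuming a MapFile already written by MapFileWriteDemo at the given uri: the in-memory index lets get() jump close to the requested key and then scan to it.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(URI.create(uri), conf);
MapFile.Reader reader = new MapFile.Reader(fs, uri, conf);
Text value = new Text();
reader.get(new IntWritable(496), value);   // index lookup, then a short scan to the key
System.out.println(value);                 // "One, two, buckle my shoe"
reader.close();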
MapFile variants: SetFile, ArrayFile, BloomMapFile
Other file formats and column-oriented formats
Sequence files and map files are the oldest binary file formats in Hadoop, but they are not the only ones; for new projects there are better binary formats to choose from.
Avro datafiles
Avro datafiles are in some respects similar to sequence files: they are designed for large-scale data processing (compact and splittable), but they are also portable across programming languages. Sequence files, map files, and Avro datafiles are all row-oriented formats: the values of each row are stored contiguously in the file.
public class AvroGenericMaxTemperature extends Configured implements Tool {
private static final Schema SCHEMA = new Schema.Parser().parse(
"{" +
" \"type\": \"record\"," +
" \"name\": \"WeatherRecord\"," +
" \"doc\": \"A weather reading.\"," +
" \"fields\": [" +
" {\"name\": \"year\", \"type\": \"int\"}," +
" {\"name\": \"temperature\", \"type\": \"int\"}," +
" {\"name\": \"stationId\", \"type\": \"string\"}" +
" ]" +
"}"
);
public static class MaxTemperatureMapper
extends Mapper<LongWritable, Text, AvroKey<Integer>,
AvroValue<GenericRecord>> {
private NcdcRecordParser parser = new NcdcRecordParser();
private GenericRecord record = new GenericData.Record(SCHEMA);
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
parser.parse(value.toString());
if (parser.isValidTemperature()) {
record.put("year", parser.getYearInt());
record.put("temperature", parser.getAirTemperature());
record.put("stationId", parser.getStationId());
context.write(new AvroKey<Integer>(parser.getYearInt()),
new AvroValue<GenericRecord>(record));
}
}
}
public static class MaxTemperatureReducer
extends Reducer<AvroKey<Integer>, AvroValue<GenericRecord>,
AvroKey<GenericRecord>, NullWritable> {
@Override
protected void reduce(AvroKey<Integer> key, Iterable<AvroValue<GenericRecord>>
values, Context context) throws IOException, InterruptedException {
GenericRecord max = null;
for (AvroValue<GenericRecord> value : values) {
GenericRecord record = value.datum();
if (max == null ||
(Integer) record.get("temperature") > (Integer) max.get("temperature")) {
max = newWeatherRecord(record);
}
}
context.write(new AvroKey(max), NullWritable.get());
}
private GenericRecord newWeatherRecord(GenericRecord value) {
GenericRecord record = new GenericData.Record(SCHEMA);
record.put("year", value.get("year"));
record.put("temperature", value.get("temperature"));
record.put("stationId", value.get("stationId"));
return record;
}
}
@Override
public int run(String[] args) throws Exception {
if (args.length != 2) {
System.err.printf("Usage: %s [generic options] <input> <output>\n",
getClass().getSimpleName());
ToolRunner.printGenericCommandUsage(System.err);
return -1;
}
Job job = new Job(getConf(), "Max temperature");
job.setJarByClass(getClass());
job.getConfiguration().setBoolean(
Job.MAPREDUCE_JOB_USER_CLASSPATH_FIRST, true);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
AvroJob.setMapOutputKeySchema(job, Schema.create(Schema.Type.INT));
AvroJob.setMapOutputValueSchema(job, SCHEMA);
AvroJob.setOutputKeySchema(job, SCHEMA);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(AvroKeyOutputFormat.class);
job.setMapperClass(MaxTemperatureMapper.class);
job.setReducerClass(MaxTemperatureReducer.class);
return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new AvroGenericMaxTemperature(), args);
System.exit(exitCode);
}
}