TextInputFormat, the default InputFormat for plain-text input:

```java
public class TextInputFormat extends FileInputFormat<LongWritable, Text>
    implements JobConfigurable {

  private CompressionCodecFactory compressionCodecs = null;

  public void configure(JobConf conf) {
    compressionCodecs = new CompressionCodecFactory(conf);
  }

  // Can the file be split?
  // 1. Not compressed: return true.
  // 2. Compressed: return true only if the codec is an instance of
  //    SplittableCompressionCodec; otherwise return false.
  protected boolean isSplitable(FileSystem fs, Path file) {
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (null == codec) {
      return true;
    }
    return codec instanceof SplittableCompressionCodec;
  }

  // Creates a LineRecordReader, which reads textinputformat.record.delimiter
  // from the configuration to determine the record (line) delimiter.
  public RecordReader<LongWritable, Text> getRecordReader(
      InputSplit genericSplit, JobConf job, Reporter reporter)
      throws IOException {
    reporter.setStatus(genericSplit.toString());
    String delimiter = job.get("textinputformat.record.delimiter");
    byte[] recordDelimiterBytes = null;
    if (null != delimiter) {
      recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8);
    }
    return new LineRecordReader(job, (FileSplit) genericSplit,
        recordDelimiterBytes);
  }
}
```
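Since getRecordReader pulls the delimiter from the job configuration, switching TextInputFormat to a custom record delimiter is a one-line config change. A minimal sketch of the old-API setup (the delimiter string is just an example):

```java
// Hedged sketch: records will be split on "record" instead of newlines.
JobConf conf = new JobConf();
conf.set("textinputformat.record.delimiter", "record");
conf.setInputFormat(TextInputFormat.class);
```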
getRecordReader hands back a LineRecordReader; its constructor does the split bookkeeping:

```java
private SplitLineReader in;  // the reader that actually reads one record (line)

public LineRecordReader(Configuration job, FileSplit split,
    byte[] recordDelimiter) throws IOException {
  this.maxLineLength = job.getInt(org.apache.hadoop.mapreduce.lib.input.
      LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
  start = split.getStart();           // start offset of this split in the file
  end = start + split.getLength();    // end offset of this split
  final Path file = split.getPath();  // the complete file this split belongs to
  compressionCodecs = new CompressionCodecFactory(job);
  codec = compressionCodecs.getCodec(file);  // the compression codec, if any

  // open the file and seek to the start of the split
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);
  if (isCompressedInput()) {  // is the input stream compressed?
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {  // splittable compression?
      final SplitCompressionInputStream cIn =
          ((SplittableCompressionCodec)codec).createInputStream(
              fileIn, decompressor, start, end,
              SplittableCompressionCodec.READ_MODE.BYBLOCK);
      in = new CompressedSplitLineReader(cIn, job, recordDelimiter);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn;  // take pos from compressed stream
    } else {
      in = new SplitLineReader(codec.createInputStream(fileIn,
          decompressor), job, recordDelimiter);
      filePosition = fileIn;
    }
  } else {
    fileIn.seek(start);
    in = new SplitLineReader(fileIn, job, recordDelimiter);
    filePosition = fileIn;
  }
  // If this is not the first split, we always throw away first record
  // because we always (except the last split) read one extra line in
  // next() method. In other words: every split except the last reads one
  // line past its end, so every split except the first skips its first
  // (partial) line. Why this works is explained later.
  if (start != 0) {
    start += in.readLine(new Text(), 0, maxBytesToConsume(start));
  }
  this.pos = start;
}
```
The client can then call next() to read one line at a time:
```java
/** Read a line. */
public synchronized boolean next(LongWritable key, Text value)
    throws IOException {
  // Synchronized, because pos is shared mutable state.
  // We always read one extra line, which lies outside the upper
  // split limit i.e. (end - 1). Since the condition below is
  // getFilePosition() <= end, in.readLine runs once more even when the
  // position is exactly equal to end.
  while (getFilePosition() <= end || in.needAdditionalRecordAfterSplit()) {
    key.set(pos);  // the current position becomes the key
    int newSize = in.readLine(value, maxLineLength,
        Math.max(maxBytesToConsume(pos), maxLineLength));
    if (newSize == 0) {
      return false;
    }
    pos += newSize;
    if (newSize < maxLineLength) {
      return true;
    }
    // line too long. try again
    LOG.info("Skipped line of size " + newSize +
        " at pos " + (pos - newSize));
  }
  return false;
}
```
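Putting the pieces together, a caller drives the reader with the usual old-API loop. A minimal sketch, assuming a FileSplit `split` and a JobConf `job` are already in hand:

```java
// Hedged sketch: the caller-side loop over TextInputFormat/LineRecordReader.
TextInputFormat format = new TextInputFormat();
format.configure(job);
RecordReader<LongWritable, Text> reader =
    format.getRecordReader(split, job, Reporter.NULL);
LongWritable key = reader.createKey();  // byte offset of the line
Text value = reader.createValue();      // the line contents
while (reader.next(key, value)) {
  // process one (offset, line) pair per iteration
}
reader.close();
```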
The in.readLine call above lands in LineReader's readLine, which dispatches on whether a custom delimiter was configured:

```java
public int readLine(Text str, int maxLineLength, int maxBytesToConsume)
    throws IOException {
  if (this.recordDelimiterBytes != null) {
    // read a record using the user-defined delimiter
    return readCustomLine(str, maxLineLength, maxBytesToConsume);
  } else {
    // read a record using the default line delimiters (CR, LF, CRLF)
    return readDefaultLine(str, maxLineLength, maxBytesToConsume);
  }
}
```
```java
/**
 * Read a line terminated by a custom delimiter.
 */
private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume)
    throws IOException {
  /* We're reading data from inputStream, but the head of the stream may be
   * already captured in the previous buffer, so we have several cases:
   *
   * 1. The tail of the buffer contains no character of the delimiter.
   *    We set ambiguousByteCount = 0.
   *
   * 2. The tail of the buffer ends with X characters that match the first
   *    X characters of the delimiter. We set ambiguousByteCount = X.
   *
   * // *** example: input fragment
   *
   * " record 1792: I found this bug very interesting and
   * I have completely read about it. record 1793: This bug
   * can be solved easily record 1794: This ."
   *
   * delimiter = "record";
   *
   * supposing:- String at the end of buffer =
   * "I found this bug very interesting and I have completely re"
   * Therefore the next buffer = "ad about it. record 179 ...."
   *
   * The matching characters in the input
   * buffer tail and delimiter head = "re"
   * Therefore, ambiguous byte count = 2 **** //
   *
   * 2.1 If the following bytes are the remaining characters of
   * the delimiter, then we have to capture only up to the starting
   * position of delimiter. That means, we need not include the
   * ambiguous characters in str.
   *
   * 2.2 If the following bytes are not the remaining characters of
   * the delimiter ( as mentioned in the example ),
   * then we have to include the ambiguous characters in str.
   */
  str.clear();
  int txtLength = 0;  // tracks str.getLength(), as an optimization
  long bytesConsumed = 0;
  int delPosn = 0;
  int ambiguousByteCount = 0;  // to capture the ambiguous characters count
  do {
    int startPosn = bufferPosn;  // start from previous end position
    if (bufferPosn >= bufferLength) {
      startPosn = bufferPosn = 0;
      bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0);
      if (bufferLength <= 0) {
        str.append(recordDelimiterBytes, 0, ambiguousByteCount);
        break; // EOF
      }
    }
    for (; bufferPosn < bufferLength; ++bufferPosn) {
      if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) {
        delPosn++;
        if (delPosn >= recordDelimiterBytes.length) {
          bufferPosn++;
          break;
        }
      } else if (delPosn != 0) {
        bufferPosn--;
        delPosn = 0;
      }
    }
    int readLength = bufferPosn - startPosn;
    bytesConsumed += readLength;
    int appendLength = readLength - delPosn;
    if (appendLength > maxLineLength - txtLength) {
      appendLength = maxLineLength - txtLength;
    }
    if (appendLength > 0) {
      if (ambiguousByteCount > 0) {
        str.append(recordDelimiterBytes, 0, ambiguousByteCount);
        // appending the ambiguous characters (refer case 2.2)
        bytesConsumed += ambiguousByteCount;
        ambiguousByteCount = 0;
      }
      str.append(buffer, startPosn, appendLength);
      txtLength += appendLength;
    }
    if (bufferPosn >= bufferLength) {
      if (delPosn > 0 && delPosn < recordDelimiterBytes.length) {
        ambiguousByteCount = delPosn;
        bytesConsumed -= ambiguousByteCount;  // to be consumed in next read
      }
    }
  } while (delPosn < recordDelimiterBytes.length
      && bytesConsumed < maxBytesToConsume);
  if (bytesConsumed > (long) Integer.MAX_VALUE) {
    throw new IOException("Too many bytes before delimiter: " + bytesConsumed);
  }
  return (int) bytesConsumed;
}
```
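The buffer-boundary bookkeeping above can be exercised directly through Hadoop's standalone org.apache.hadoop.util.LineReader, which takes this same readCustomLine path when constructed with a delimiter. A minimal self-contained sketch with toy input (note that a leading delimiter yields an empty first record):

```java
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

public class CustomDelimiterDemo {
  public static void main(String[] args) throws IOException {
    byte[] delimiter = "record".getBytes(StandardCharsets.UTF_8);
    ByteArrayInputStream in = new ByteArrayInputStream(
        "record 1792: interesting record 1793: easy".getBytes(StandardCharsets.UTF_8));
    LineReader reader = new LineReader(in, new Configuration(), delimiter);
    Text rec = new Text();
    while (reader.readLine(rec) > 0) {      // returns bytes consumed; 0 at EOF
      System.out.println("[" + rec + "]");  // text between delimiters
    }
    reader.close();
  }
}
```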
```java
/**
 * Read a line terminated by one of CR, LF, or CRLF.
 */
private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume)
    throws IOException {
  /* We're reading data from in, but the head of the stream may be
   * already buffered in buffer, so we have several cases:
   * 1. No newline characters are in the buffer, so we need to copy
   *    everything and read another buffer from the stream.
   * 2. An unambiguously terminated line is in buffer, so we just
   *    copy to str.
   * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
   *    in CR. In this case we copy everything up to CR to str, but
   *    we also need to see what follows CR: if it's LF, then we
   *    need consume LF as well, so next call to readLine will read
   *    from after that.
   * We use a flag prevCharCR to signal if previous character was CR
   * and, if it happens to be at the end of the buffer, delay
   * consuming it until we have a chance to look at the char that
   * follows.
   */
  // The comment above covers all three newline conventions:
  // UNIX:    '\n'   (LF)
  // Mac:     '\r'   (CR)
  // Windows: '\r\n' (CR followed by LF)
  str.clear();
  int txtLength = 0;           // tracks str.getLength(), as an optimization
  int newlineLength = 0;       // length of terminating newline
  boolean prevCharCR = false;  // true if prev char was CR
  long bytesConsumed = 0;
  do {
    int startPosn = bufferPosn;  // starting from where we left off the last time
    if (bufferPosn >= bufferLength) {
      startPosn = bufferPosn = 0;
      if (prevCharCR) {
        ++bytesConsumed;  // account for CR from previous read
      }
      bufferLength = fillBuffer(in, buffer, prevCharCR);
      if (bufferLength <= 0) {
        break; // EOF
      }
    }
    for (; bufferPosn < bufferLength; ++bufferPosn) {  // search for newline
      if (buffer[bufferPosn] == LF) {
        newlineLength = (prevCharCR) ? 2 : 1;
        ++bufferPosn;  // at next invocation proceed from following byte
        break;
      }
      if (prevCharCR) {  // CR + notLF, we are at notLF
        newlineLength = 1;
        break;
      }
      prevCharCR = (buffer[bufferPosn] == CR);
    }
    int readLength = bufferPosn - startPosn;
    if (prevCharCR && newlineLength == 0) {
      --readLength;  // CR at the end of the buffer
    }
    bytesConsumed += readLength;
    int appendLength = readLength - newlineLength;
    if (appendLength > maxLineLength - txtLength) {
      appendLength = maxLineLength - txtLength;
    }
    if (appendLength > 0) {
      str.append(buffer, startPosn, appendLength);
      txtLength += appendLength;
    }
  } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);
  if (bytesConsumed > (long) Integer.MAX_VALUE) {
    throw new IOException("Too many bytes before newline: " + bytesConsumed);
  }
  return (int) bytesConsumed;
}
```

Note the do/while exit condition: the loop only ends once a complete newline has been consumed (newlineLength != 0) or maxBytesToConsume bytes have been read.
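A quick way to confirm the CR/LF handling is to feed LineReader (which delegates to this readDefaultLine when no delimiter is set) a mix of all three conventions. A minimal self-contained sketch:

```java
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

public class NewlineDemo {
  public static void main(String[] args) throws IOException {
    ByteArrayInputStream in = new ByteArrayInputStream(
        "unix\nmac\rwindows\r\nlast".getBytes(StandardCharsets.UTF_8));
    LineReader reader = new LineReader(in, new Configuration());
    Text line = new Text();
    while (reader.readLine(line) > 0) {
      System.out.println(line);  // prints: unix, mac, windows, last
    }
    reader.close();
  }
}
```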
Now the SequenceFile side. SequenceFileInputFormat overrides listStatus to cope with MapFile directories and hands out a SequenceFileRecordReader:

```java
public class SequenceFileInputFormat<K, V> extends FileInputFormat<K, V> {

  public SequenceFileInputFormat() {
    setMinSplitSize(SequenceFile.SYNC_INTERVAL);
  }

  @Override
  protected FileStatus[] listStatus(JobConf job) throws IOException {
    FileStatus[] files = super.listStatus(job);
    for (int i = 0; i < files.length; i++) {
      FileStatus file = files[i];
      if (file.isDirectory()) {  // it's a MapFile
        Path dataFile = new Path(file.getPath(), MapFile.DATA_FILE_NAME);
        FileSystem fs = file.getPath().getFileSystem(job);
        // use the data file
        files[i] = fs.getFileStatus(dataFile);
      }
    }
    return files;
  }

  // Creates a SequenceFileRecordReader.
  public RecordReader<K, V> getRecordReader(InputSplit split,
      JobConf job, Reporter reporter) throws IOException {
    reporter.setStatus(split.toString());
    return new SequenceFileRecordReader<K, V>(job, (FileSplit) split);
  }
}
```
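The setMinSplitSize(SequenceFile.SYNC_INTERVAL) call in the constructor keeps every split at least as large as the spacing at which sync markers are written. Wiring the format into a job is straightforward; a minimal sketch with a hypothetical input path:

```java
// Hedged sketch: selecting SequenceFileInputFormat for an old-API job.
JobConf job = new JobConf();
job.setInputFormat(SequenceFileInputFormat.class);
FileInputFormat.setInputPaths(job, new Path("/data/input"));  // hypothetical path
```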
SequenceFileRecordReader wraps a SequenceFile.Reader:

```java
private SequenceFile.Reader in;

public SequenceFileRecordReader(Configuration conf, FileSplit split)
    throws IOException {
  Path path = split.getPath();
  FileSystem fs = path.getFileSystem(conf);
  this.in = new SequenceFile.Reader(fs, path, conf);
  this.end = split.getStart() + split.getLength();
  this.conf = conf;
  if (split.getStart() > in.getPosition())
    in.sync(split.getStart());  // move the start position to the next sync
                                // point; see the earlier post on SequenceFile
                                // for what sync markers are
  this.start = in.getPosition();
  more = start < end;
}

public synchronized boolean next(K key, V value) throws IOException {
  if (!more) return false;
  long pos = in.getPosition();
  boolean remaining = (in.next(key) != null);
  if (remaining) {
    getCurrentValue(value);
  }
  if (pos >= end && in.syncSeen()) {
    more = false;
  } else {
    more = remaining;
  }
  return more;
}

protected synchronized boolean next(K key) throws IOException {
  if (!more) return false;
  long pos = in.getPosition();
  boolean remaining = (in.next(key) != null);
  if (pos >= end && in.syncSeen()) {
    more = false;
  } else {
    more = remaining;
  }
  return more;
}

protected synchronized void getCurrentValue(V value) throws IOException {
  in.getCurrentValue(value);
}
```
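The in.sync(split.getStart()) call is what makes arbitrary byte-range splits safe: the reader skips forward to the first sync marker at or after the requested offset, so reading always begins on a record boundary. The same positioning can be done by hand; a minimal sketch with a hypothetical file and offset:

```java
// Hedged sketch: manual sync-point positioning, mirroring the constructor above.
Configuration conf = new Configuration();
Path path = new Path("/data/example.seq");  // hypothetical file
FileSystem fs = path.getFileSystem(conf);
SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
reader.sync(12345L);                // jump to the first sync marker at/after byte 12345
long start = reader.getPosition();  // now guaranteed to be a record boundary
```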
Inside SequenceFile.Reader, init() parses the file header:

```java
private void init(boolean tempReader) throws IOException {
  byte[] versionBlock = new byte[VERSION.length];
  in.readFully(versionBlock);
  if ((versionBlock[0] != VERSION[0]) ||
      (versionBlock[1] != VERSION[1]) ||
      (versionBlock[2] != VERSION[2]))
    // check the magic header: is this a SEQ file?
    throw new IOException(this + " not a SequenceFile");

  // Set 'version'
  version = versionBlock[3];
  if (version > VERSION[3])  // is the version supported?
    throw new VersionMismatchException(VERSION[3], version);

  if (version < BLOCK_COMPRESS_VERSION) {
    UTF8 className = new UTF8();
    className.readFields(in);
    keyClassName = className.toStringChecked();  // key class name
    className.readFields(in);
    valClassName = className.toStringChecked();  // val class name
  } else {
    keyClassName = Text.readString(in);
    valClassName = Text.readString(in);
  }

  if (version > 2) {                    // if version > 2
    this.decompress = in.readBoolean(); // is compressed?
  } else {
    decompress = false;
  }

  if (version >= BLOCK_COMPRESS_VERSION) {    // if version >= 4
    this.blockCompressed = in.readBoolean();  // is block-compressed?
  } else {
    blockCompressed = false;
  }

  // if version >= 5
  // setup the compression codec
  if (decompress) {
    if (version >= CUSTOM_COMPRESS_VERSION) {
      String codecClassname = Text.readString(in);
      try {
        Class<? extends CompressionCodec> codecClass
            = conf.getClassByName(codecClassname).asSubclass(CompressionCodec.class);
        this.codec = ReflectionUtils.newInstance(codecClass, conf);
      } catch (ClassNotFoundException cnfe) {
        throw new IllegalArgumentException("Unknown codec: " +
                                           codecClassname, cnfe);
      }
    } else {
      codec = new DefaultCodec();
      ((Configurable)codec).setConf(conf);
    }
  }

  this.metadata = new Metadata();
  if (version >= VERSION_WITH_METADATA) {  // if version >= 6
    this.metadata.readFields(in);
  }

  if (version > 1) {          // if version > 1
    in.readFully(sync);       // read sync bytes
    headerEnd = in.getPos();  // record end of header
  }

  // Initialize... *not* if this we are constructing a temporary Reader
  if (!tempReader) {
    valBuffer = new DataInputBuffer();
    if (decompress) {
      valDecompressor = CodecPool.getDecompressor(codec);
      valInFilter = codec.createInputStream(valBuffer, valDecompressor);
      valIn = new DataInputStream(valInFilter);
    } else {
      valIn = valBuffer;
    }

    if (blockCompressed) {
      keyLenBuffer = new DataInputBuffer();
      keyBuffer = new DataInputBuffer();
      valLenBuffer = new DataInputBuffer();

      keyLenDecompressor = CodecPool.getDecompressor(codec);
      keyLenInFilter = codec.createInputStream(keyLenBuffer,
          keyLenDecompressor);
      keyLenIn = new DataInputStream(keyLenInFilter);

      keyDecompressor = CodecPool.getDecompressor(codec);
      keyInFilter = codec.createInputStream(keyBuffer, keyDecompressor);
      keyIn = new DataInputStream(keyInFilter);

      valLenDecompressor = CodecPool.getDecompressor(codec);
      valLenInFilter = codec.createInputStream(valLenBuffer,
          valLenDecompressor);
      valLenIn = new DataInputStream(valLenInFilter);
    }

    SerializationFactory serializationFactory = new SerializationFactory(conf);
    this.keyDeserializer =
        getDeserializer(serializationFactory, getKeyClass());
    if (this.keyDeserializer == null) {
      throw new IOException(
          "Could not find a deserializer for the Key class: '"
              + getKeyClass().getCanonicalName() + "'. "
              + "Please ensure that the configuration '"
              + CommonConfigurationKeys.IO_SERIALIZATIONS_KEY + "' is "
              + "properly configured, if you're using "
              + "custom serialization.");
    }
    if (!blockCompressed) {
      this.keyDeserializer.open(valBuffer);
    } else {
      this.keyDeserializer.open(keyIn);
    }
    this.valDeserializer =
        getDeserializer(serializationFactory, getValueClass());
    if (this.valDeserializer == null) {
      throw new IOException(
          "Could not find a deserializer for the Value class: '"
              + getValueClass().getCanonicalName() + "'. "
              + "Please ensure that the configuration '"
              + CommonConfigurationKeys.IO_SERIALIZATIONS_KEY + "' is "
              + "properly configured, if you're using "
              + "custom serialization.");
    }
    this.valDeserializer.open(valIn);
  }
}
```
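For context, everything init() parses (the SEQ magic, version byte, key/value class names, compression flags, metadata, and sync bytes) is laid down by SequenceFile.Writer when the file is created. A minimal writing sketch with a hypothetical path:

```java
// Hedged sketch: producing a SequenceFile whose header init() would parse.
Configuration conf = new Configuration();
Path path = new Path("/tmp/example.seq");  // hypothetical path
FileSystem fs = path.getFileSystem(conf);
SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path,
    LongWritable.class, Text.class);  // these class names are recorded in the header
writer.append(new LongWritable(0), new Text("first record"));
writer.close();
```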
```java
/** Read the next key/value pair in the file into <code>key</code> and
 * <code>val</code>. Returns true if such a pair exists and false when at
 * end of file */
public synchronized boolean next(Writable key, Writable val)
    throws IOException {
  if (val.getClass() != getValueClass())
    throw new IOException("wrong value class: " + val + " is not " + valClass);

  boolean more = next(key);  // read the next key
  if (more) {
    getCurrentValue(val);    // fetch the value that goes with that key
  }
  return more;
}
```
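From the caller's side this is the familiar read loop; a minimal self-contained sketch (the path is hypothetical):

```java
// Hedged sketch: iterating over all key/value pairs in a SequenceFile.
Configuration conf = new Configuration();
Path path = new Path("/tmp/example.seq");  // hypothetical path
FileSystem fs = path.getFileSystem(conf);
SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
LongWritable key = new LongWritable();
Text value = new Text();
while (reader.next(key, value)) {  // false at end of file
  System.out.println(key + "\t" + value);
}
reader.close();
```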
```java
/** Read the next key in the file into <code>key</code>, skipping its
 * value. True if another entry exists, and false at end of file. */
public synchronized boolean next(Writable key) throws IOException {
  if (key.getClass() != getKeyClass())
    throw new IOException("wrong key class: " + key.getClass().getName()
                          + " is not " + keyClass);

  if (!blockCompressed) {  // not block-compressed
    outBuf.reset();
    keyLength = next(outBuf);  // get the key's length
    if (keyLength < 0)
      return false;

    valBuffer.reset(outBuf.getData(), outBuf.getLength());
    key.readFields(valBuffer);  // read the key's contents
    valBuffer.mark(0);
    if (valBuffer.getPosition() != keyLength)
      throw new IOException(key + " read " + valBuffer.getPosition()
                            + " bytes, should read " + keyLength);
  } else {
    // Reset syncSeen
    syncSeen = false;

    if (noBufferedKeys == 0) {
      try {
        readBlock();
      } catch (EOFException eof) {
        return false;
      }
    }

    int keyLength = WritableUtils.readVInt(keyLenIn);

    // Sanity check
    if (keyLength < 0) {
      return false;
    }

    // Read another compressed 'key'
    key.readFields(keyIn);
    --noBufferedKeys;
  }

  return true;
}
```
```java
/**
 * Get the 'value' corresponding to the last read 'key'.
 * @param val : The 'value' to be read.
 * @throws IOException
 */
public synchronized void getCurrentValue(Writable val) throws IOException {
  if (val instanceof Configurable) {
    ((Configurable) val).setConf(this.conf);
  }

  // Position stream to 'current' value
  seekToCurrentValue();

  if (!blockCompressed) {
    val.readFields(valIn);

    if (valIn.read() > 0) {
      LOG.info("available bytes: " + valIn.available());
      throw new IOException(val + " read "
                            + (valBuffer.getPosition() - keyLength)
                            + " bytes, should read "
                            + (valBuffer.getLength() - keyLength));
    }
  } else {
    // Get the value
    int valLength = WritableUtils.readVInt(valLenIn);  // get the value's length
    val.readFields(valIn);                             // read the value itself

    // Read another compressed 'value'
    --noBufferedValues;

    // Sanity check
    if ((valLength < 0) && LOG.isDebugEnabled()) {
      LOG.debug(val + " is a zero-length value");
    }
  }
}
```
Summing up the trade-off between the two file formats:

| | Storage size | Read efficiency |
|---|---|---|
| SequenceFile | larger (must store key lengths, value lengths, sync markers, etc.) | better (because the stored key/value lengths let the reader locate and read a key or value directly) |
| TextFile | smaller | worse (must keep scanning until a delimiter is encountered) |
The two formats can be compared directly in Hive. First, load the data into a TEXTFILE table:

```sql
CREATE TABLE u_data (
  userid INT,
  movieid INT,
  rating INT,
  unixtime STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;

LOAD DATA LOCAL INPATH '/usr/test/hivesourcedata/ml-100k/u.data'
OVERWRITE INTO TABLE u_data;
```
Then copy the same data into a SEQUENCEFILE table:

```sql
CREATE TABLE u_data_seq (
  userid INT,
  movieid INT,
  rating INT,
  unixtime STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS SEQUENCEFILE;

INSERT OVERWRITE TABLE u_data_seq
SELECT * FROM u_data;
```
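With both tables loaded, the storage-size difference claimed in the comparison table above can be verified directly on HDFS. A hedged sketch in Java; the paths assume the default Hive warehouse location and are not guaranteed:

```java
// Hedged sketch: comparing the on-disk footprint of the two tables.
// Warehouse paths are assumptions (default layout), not guaranteed.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
long textBytes = fs.getContentSummary(
    new Path("/user/hive/warehouse/u_data")).getLength();
long seqBytes = fs.getContentSummary(
    new Path("/user/hive/warehouse/u_data_seq")).getLength();
// Expect seqBytes > textBytes: the SequenceFile also stores lengths and sync markers.
System.out.println("TEXTFILE: " + textBytes + ", SEQUENCEFILE: " + seqBytes);
```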
When Hive writes a SEQUENCEFILE table it goes through HiveSequenceFileOutputFormat, which writes an empty key for every row:

```java
public class HiveSequenceFileOutputFormat<K,V>
    extends SequenceFileOutputFormat<K,V>
    implements HiveOutputFormat<K, V> {

  BytesWritable EMPTY_KEY = new BytesWritable();

  /**
   * create the final out file, and output an empty key as the key.
   *
   * @param jc
   *          the job configuration file
   * @param finalOutPath
   *          the final output file to be created
   * @param valueClass
   *          the value class used for create
   * @param isCompressed
   *          whether the content is compressed or not
   * @param tableProperties
   *          the tableInfo of this file's corresponding table
   * @param progress
   *          progress used for status report
   * @return the RecordWriter for the output file
   */
  @Override
  public RecordWriter getHiveRecordWriter(JobConf jc, Path finalOutPath,
      Class<? extends Writable> valueClass, boolean isCompressed,
      Properties tableProperties, Progressable progress) throws IOException {
    FileSystem fs = finalOutPath.getFileSystem(jc);
    final SequenceFile.Writer outStream = Utilities.createSequenceWriter(jc,
        fs, finalOutPath, BytesWritable.class, valueClass, isCompressed);

    return new RecordWriter() {
      @Override
      public void write(Writable r) throws IOException {
        outStream.append(EMPTY_KEY, r);  // the key is always EMPTY_KEY
      }

      @Override
      public void close(boolean abort) throws IOException {
        outStream.close();
      }
    };
  }
}
```
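Because getHiveRecordWriter always appends EMPTY_KEY, the keys in a Hive-written sequence file carry no information; only the values hold row data. A hedged read-back sketch: the file path is hypothetical, and the Text value class is an assumption (typical for the delimited serde used above):

```java
// Hedged sketch: reading a Hive-produced sequence file. Keys are zero-length
// BytesWritable; values (assumed Text here) hold the serialized rows.
Configuration conf = new Configuration();
Path path = new Path("/user/hive/warehouse/u_data_seq/000000_0");  // hypothetical
FileSystem fs = path.getFileSystem(conf);
SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
BytesWritable key = new BytesWritable();
Text value = new Text();
while (reader.next(key, value)) {
  // key.getLength() == 0 for every record
  System.out.println(value);
}
reader.close();
```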