Preface:
This is just a gripe about Hadoop: The Definitive Guide and can be skipped. The issue genuinely drove me up the wall. The secondary sort example in section 9.2.4 of the Chinese 4th edition finds the maximum temperature for each year, with years sorted ascending and temperatures descending. For the composite key, the book says a custom partitioner only guarantees that each reducer receives all the records for a given year, and that within a partition the reducer still groups records by key, so a custom grouping comparator is needed. Yet the result figures before and after adding the custom grouping are exactly identical!? What is that supposed to demonstrate? I was also puzzled why, after partitioning and grouping, simply calling context.write(key, NullWritable.get()) in the reduce method outputs the maximum temperature.
MapReduce implementation:
1. Define a composite key combining the natural key and the natural value; the class must implement WritableComparable.
import java.io.*;
import org.apache.hadoop.io.*;

public class IntPair implements WritableComparable<IntPair> {

  private int first;
  private int second;

  public IntPair() {
  }

  public IntPair(int first, int second) {
    set(first, second);
  }

  public void set(int first, int second) {
    this.first = first;
    this.second = second;
  }

  public int getFirst() {
    return first;
  }

  public int getSecond() {
    return second;
  }

  @Override
  public void write(DataOutput out) throws IOException {
    // Writable contract: serialize the fields in a fixed order
    out.writeInt(first);
    out.writeInt(second);
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    // deserialize in the same order the fields were written
    first = in.readInt();
    second = in.readInt();
  }

  @Override
  public int hashCode() {
    return first * 163 + second;
  }

  @Override
  public boolean equals(Object o) {
    if (o instanceof IntPair) {
      IntPair ip = (IntPair) o;
      return first == ip.first && second == ip.second;
    }
    return false;
  }

  @Override
  public String toString() {
    return first + "\t" + second;
  }

  @Override
  public int compareTo(IntPair ip) {
    // default ordering: both fields ascending (overridden by KeyComparator below)
    int cmp = compare(first, ip.first);
    if (cmp != 0) {
      return cmp;
    }
    return compare(second, ip.second);
  }

  /**
   * Convenience method for comparing two ints.
   */
  public static int compare(int a, int b) {
    return (a < b ? -1 : (a == b ? 0 : 1));
  }
}
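Before moving on, a quick sanity check of the Writable contract — my own snippet, not from the book — showing that write() and readFields() round-trip an IntPair:

import java.io.*;

// Illustrative only: round-trip an IntPair through its Writable serialization.
public class IntPairRoundTrip {
  public static void main(String[] args) throws IOException {
    IntPair original = new IntPair(1900, 35);

    // serialize to a byte buffer, exactly as the framework would to disk/network
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    original.write(new DataOutputStream(bytes));

    // deserialize into a fresh instance created via the no-arg constructor
    IntPair copy = new IntPair();
    copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

    System.out.println(copy + " equals original: " + original.equals(copy)); // true
  }
}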
2. The job driver, wiring up the custom partitioner, key comparator, and grouping comparator:
import java.io.IOException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MaxTemperatureUsingSecondarySort
    extends Configured implements Tool {

  static class MaxTemperatureMapper
      extends Mapper<LongWritable, Text, IntPair, NullWritable> {

    // NcdcRecordParser is a helper class from the book's example code
    private NcdcRecordParser parser = new NcdcRecordParser();

    @Override
    protected void map(LongWritable key, Text value,
        Context context) throws IOException, InterruptedException {
      parser.parse(value);
      if (parser.isValidTemperature()) {
        // the whole (year, temperature) pair is the key; the value is empty
        context.write(new IntPair(parser.getYearInt(),
            parser.getAirTemperature()), NullWritable.get());
      }
    }
  }

  static class MaxTemperatureReducer
      extends Reducer<IntPair, NullWritable, IntPair, NullWritable> {

    @Override
    protected void reduce(IntPair key, Iterable<NullWritable> values,
        Context context) throws IOException, InterruptedException {
      // the first key of each group is the maximum temperature of that year
      context.write(key, NullWritable.get());
    }
  }

  public static class FirstPartitioner
      extends Partitioner<IntPair, NullWritable> {

    @Override
    public int getPartition(IntPair key, NullWritable value, int numPartitions) {
      // partition on the year only; multiply by 127 to perform some mixing
      return Math.abs(key.getFirst() * 127) % numPartitions;
    }
  }

  public static class KeyComparator extends WritableComparator {
    protected KeyComparator() {
      // true tells WritableComparator to instantiate IntPairs to
      // deserialize the raw bytes into before comparing (see point 8 below)
      super(IntPair.class, true);
    }
    @Override
    public int compare(WritableComparable w1, WritableComparable w2) {
      IntPair ip1 = (IntPair) w1;
      IntPair ip2 = (IntPair) w2;
      int cmp = IntPair.compare(ip1.getFirst(), ip2.getFirst());
      if (cmp != 0) {
        return cmp;
      }
      return -IntPair.compare(ip1.getSecond(), ip2.getSecond()); // reverse
    }
  }

  public static class GroupComparator extends WritableComparator {
    protected GroupComparator() {
      super(IntPair.class, true);
    }
    @Override
    public int compare(WritableComparable w1, WritableComparable w2) {
      // group on the year (first field) only
      IntPair ip1 = (IntPair) w1;
      IntPair ip2 = (IntPair) w2;
      return IntPair.compare(ip1.getFirst(), ip2.getFirst());
    }
  }

  @Override
  public int run(String[] args) throws Exception {
    // JobBuilder is a helper class from the book's example code
    Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (job == null) {
      return -1;
    }

    job.setMapperClass(MaxTemperatureMapper.class);
    job.setPartitionerClass(FirstPartitioner.class);
    job.setSortComparatorClass(KeyComparator.class);
    job.setGroupingComparatorClass(GroupComparator.class);
    job.setReducerClass(MaxTemperatureReducer.class);
    job.setOutputKeyClass(IntPair.class);
    job.setOutputValueClass(NullWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
  }

  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new MaxTemperatureUsingSecondarySort(), args);
    System.exit(exitCode);
  }
}
Flow:
1. TextInputFormat uses a RecordReader to read the input text line by line, with the byte offset of each line as the key and the line's content as the value. The map function turns each <LongWritable, Text> pair into an <IntPair, NullWritable> pair, producing a list of <IntPair, NullWritable> records.
2. At the end of the map side, this list is first partitioned using the class set by job.setPartitionerClass, and each partition maps to one reducer. Within each partition, records are then sorted by the KeyComparator set by job.setSortComparatorClass (which overrides the ordering defined in IntPair.compareTo), yielding the first column in ascending order and the second column in descending order.
3. After a reducer has received all of the map outputs assigned to it, it merge-sorts them, again using the KeyComparator set by job.setSortComparatorClass, so IntPairs with the same first field end up adjacent. It then builds the value iterator for each key, and this is where the grouping comparator set by job.setGroupingComparatorClass comes in.
4. In the reducer's context (actually the ReduceContextImpl class), the GroupComparator checks whether the current intermediate key (an IntPair object in this example) is the same as the next one. While the comparator returns 0 (0 means same, non-zero means different), the reducer's run() keeps invoking it inside the context's nextKey()/nextKeyValue() methods; each call to nextKeyValue() overwrites the context's current key/value pair. When the comparator returns non-zero, run() calls the reduce method with the context's current key/value pair. So in this example, where the intermediate output is sorted by year ascending and temperature descending, the context keeps calling the GroupComparator and updating the current key/value until a non-zero result ends the group. (As long as the comparator says two keys are equal, they belong to the same group; see the GroupingDemo sketch after this list.)
5. The reduce method is then entered; its input is each group's key together with an iterator over that group's values.
6. Partitioning decides, based on the key, which reducer each map output record is sent to; the key comparator determines how keys are sorted; the grouping comparator determines which records form one group and therefore enter a single reduce() call together.
7. The key comparator has another important role: if the job does not set a grouping comparator, the key comparator (if one is set) also acts as the grouping comparator, i.e. it is used in both the map phase and the reduce phase.
8. When implementing a comparator, the constructor must call super(XXX.class, true), where XXX is the class being compared and true tells WritableComparator to create instances of that class internally. During comparison, the comparator deserializes the raw bytes of the inputs into those internal XXX objects and then passes them to its compare() method. Without this call, you get a NullPointerException at runtime.
9. Back to the original question: why does context.write(key, NullWritable.get()) output the maximum temperature? It is not that grouping collapses each group into a single record. What reaches the reduce method is a group whose keys are [1900,35], [1900,34], [1900,33], and so on, together with an iterator of null values. Because [1900,35] sorts first, and context.write(key, NullWritable.get()) is called only once per group, the first (i.e. maximum) key is what gets written.
10. So, to get the top K of each group instead, do the following (the original leaves k undefined; a constant is used here as an example):
static class MaxTemperatureReducer
    extends Reducer<IntPair, NullWritable, IntPair, NullWritable> {

  private static final int K = 3; // e.g. take the top 3 per year

  @Override
  protected void reduce(IntPair key, Iterable<NullWritable> values,
      Context context) throws IOException, InterruptedException {
    int i = 0;
    for (NullWritable value : values) {
      i++;
      if (i <= K) {
        // advancing the value iterator also updates `key` to the next
        // IntPair in the group (see step 4), so each write emits a new pair
        context.write(key, NullWritable.get());
      } else {
        break;
      }
    }
  }
}
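To make step 4 concrete, here is a minimal, self-contained simulation of how the grouping comparator carves the sorted key stream into reduce() calls. It is my own illustration, not the Hadoop source; it reuses the IntPair class from above, and its comparator compares years only, just like GroupComparator:

import java.util.*;

public class GroupingDemo {
  public static void main(String[] args) {
    // map output after the key comparator: year ascending, temperature descending
    List<IntPair> sorted = Arrays.asList(
        new IntPair(1900, 35), new IntPair(1900, 34), new IntPair(1900, 33),
        new IntPair(1901, 36), new IntPair(1901, 30));

    // grouping comparator: compare the first field (year) only
    Comparator<IntPair> group = Comparator.comparingInt(IntPair::getFirst);

    IntPair current = null;
    for (IntPair key : sorted) {
      if (current == null || group.compare(current, key) != 0) {
        // non-zero result: a new group begins, so the framework would call
        // reduce() here, with this key as the group's first (current) key
        System.out.println("reduce() sees first key: " + key);
      }
      current = key; // nextKeyValue() overwrites the context's current key
    }
  }
}

Running it prints 1900 35 and 1901 36: the first key of each group is the per-year maximum, which is exactly why the book's reducer can get away with a single context.write(key, NullWritable.get()).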
Top N in Hive
Hive provides dedicated ranking functions, so the overall implementation is very simple:
row_number() over ([partition by col1] [order by col2])
rank() over ([partition by col1] [order by col2])
dense_rank() over ([partition by col1] [order by col2])
All three group rows by col1, sort each group by col2, and assign each sorted row a number starting from 1 and increasing.
col1 and col2 can each be several columns, separated by ','.
Differences:
1) row_number: the number keeps increasing regardless of ties in col2; e.g. two rows with equal values still get distinct numbers, one first and one second.
2) rank: rows with equal col2 values share the same number, and the next distinct col2 value jumps ahead by N (N being the number of ties); e.g. two rows tie for first, the next row is third, and there is no second.
3) dense_rank: rows with equal col2 values share the same number, and the next distinct value increases by just 1; e.g. two rows tie for first and the next row is second.
row_number can additionally be combined with between ... and ... for pagination (see the sketch after the queries below).
create table yt (
  year string,
  temperature int
) row format delimited fields terminated by '\t';

-- top 3 temperatures for each year
select * from (
  select *, row_number() over (partition by year order by temperature desc) as tem
  from yt
) t where tem <= 3;

-- rank of 35 degrees in the year 1900
select tem from (
  select *, rank() over (partition by year order by temperature desc) as tem
  from yt
) t where year = '1900' and temperature = 35;
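To make the difference between the three functions concrete, and to show the pagination trick mentioned above, here is a sketch against the same yt table (the aliases rn, rk, and drk are mine):

-- side-by-side comparison of the three ranking functions per year
select year, temperature,
       row_number() over (partition by year order by temperature desc) as rn,
       rank()       over (partition by year order by temperature desc) as rk,
       dense_rank() over (partition by year order by temperature desc) as drk
from yt;

-- pagination with row_number: rows 4-6 per year ("page 2" with a page size of 3)
select * from (
  select *, row_number() over (partition by year order by temperature desc) as rn
  from yt
) t where rn between 4 and 6;

For a year with temperatures 35, 35, 34, the (rn, rk, drk) columns would read (1,1,1), (2,1,1), (3,3,2), matching the differences listed above.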
Group-wise top N in MySQL
In MySQL, the top-N problem turns into computing each row's rank among all records of its group: if N rows have a larger value in the relevant column than the current row, the current row's rank is N+1 (equal values share a rank). For example, among the type-2 views values {877, 678, 300, 99} in the data below, the row with 300 has two larger rows, so its rank is 3.
Data preparation:
DROP TABLE IF EXISTS `info`;
CREATE TABLE `info` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
  `views` int(255) DEFAULT NULL,
  `info_type_id` int(11) DEFAULT NULL,
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Records of info
-- ----------------------------
INSERT INTO `info` VALUES (1, '罗志祥黑眼圈', 10, 4);
INSERT INTO `info` VALUES (2, '新冠肺炎', 22, 4);
INSERT INTO `info` VALUES (3, '流浪地球电影大火', 188, 1);
INSERT INTO `info` VALUES (4, '葛优瘫', 99, 2);
INSERT INTO `info` VALUES (5, '周杰伦出新歌了', 877, 2);
INSERT INTO `info` VALUES (6, '蔡依林西安演唱会', 86, 1);
INSERT INTO `info` VALUES (7, '中纪委调盐', 67, 3);
INSERT INTO `info` VALUES (8, '人民大会堂', 109, 3);
INSERT INTO `info` VALUES (9, '重庆称为网红城市', 202, 1);
INSERT INTO `info` VALUES (10, '胡歌结婚了', 300, 2);
INSERT INTO `info` VALUES (11, 'ipone15马上上市', 678, 2);
INSERT INTO `info` VALUES (12, '中国探月成功', 54, 4);
INSERT INTO `info` VALUES (13, 'Trump演说家', 67, 4);
DROP TABLE IF EXISTS `info_type`;
CREATE TABLE `info_type` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `name` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Records of info_type
-- ----------------------------
INSERT INTO `info_type` VALUES (1, '娱乐');
INSERT INTO `info_type` VALUES (2, '八卦');
INSERT INTO `info_type` VALUES (3, '政治');
Requirement: produce a list of hot items, keeping only the top 3 per category.
SELECT t.* FROM (
  SELECT
    t1.*,
    (SELECT count(*) + 1 FROM info t2
     WHERE t2.info_type_id = t1.info_type_id AND t2.views > t1.views) AS top
  FROM info t1
) t WHERE top <= 3 ORDER BY t.info_type_id, top;
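For comparison, on MySQL 8.0 or later the same requirement can be expressed with a window function instead of the correlated subquery; RANK() reproduces the count(*)+1 tie handling above (the rk alias is mine):

-- requires MySQL 8.0+; RANK() gives tied rows the same rank, like count(*)+1
SELECT * FROM (
  SELECT t1.*,
         RANK() OVER (PARTITION BY info_type_id ORDER BY views DESC) AS rk
  FROM info t1
) t
WHERE rk <= 3
ORDER BY info_type_id, rk;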