1 Common friends
Approach: take user A, whose friends are B, C, D, E and F. A is then a common friend of the pair B and C, so on the map side emit the pair BC as the key and A as the value. Loop over every other pair of A's friends in the same way, and do the same for every other user.
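To make the map-side output concrete, suppose one input line is exactly the example above: owner A followed by A's friends B C D E F. The mapper emits every pair of A's friends as a key, with A as the value:

A B C D E F
  -> BC A, BD A, BE A, BF A, CD A, CE A, CF A, DE A, DF A, EF A

On the reduce side all values that arrive for the same pair key are concatenated with ":", giving that pair's list of common friends.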
import java.io.IOException;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class FindFriend {

    public static class ChangeMapper extends Mapper<Object, Text, Text, Text> {
        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // Input line: an owner followed by that owner's friends, whitespace separated.
            StringTokenizer itr = new StringTokenizer(value.toString());
            Text owner = new Text();
            Set<String> set = new TreeSet<String>();
            owner.set(itr.nextToken());
            while (itr.hasMoreTokens()) {
                set.add(itr.nextToken());
            }
            String[] friends = new String[set.size()];
            friends = set.toArray(friends);

            // Every pair of the owner's friends has the owner as a common friend.
            for (int i = 0; i < friends.length; i++) {
                for (int j = i + 1; j < friends.length; j++) {
                    String outputkey = friends[i] + friends[j];
                    context.write(new Text(outputkey), owner);
                }
            }
        }
    }

    public static class FindReducer extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values,
                Context context) throws IOException, InterruptedException {
            // Concatenate all common friends of this pair with ":".
            String commonfriends = "";
            for (Text val : values) {
                if (commonfriends.isEmpty()) {
                    commonfriends = val.toString();
                } else {
                    commonfriends = commonfriends + ":" + val.toString();
                }
            }
            context.write(key, new Text(commonfriends));
        }
    }

    public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {

        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("args error");
            System.exit(2);
        }
        Job job = new Job(conf, "find common friends");
        job.setJarByClass(FindFriend.class);
        job.setMapperClass(ChangeMapper.class);
        job.setCombinerClass(FindReducer.class);
        job.setReducerClass(FindReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job,
                new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Result (each line is a pair of users followed by their common friends; for example, the first line says that A and B's common friends are E, C and D):
AB E:C:D
AC E:B
AD B:E
AE C:B:D
BC A:E
BD A:E
BE C:D:A
BF A
CD E:A:B
CE A:B
CF A
DE B:A
DF A
EF A
2 Base station dwell time
Requirement:
Expected output:
Approach:
Load the data into a Hive table; when querying, just sort by phone number and time.
3 Batch replacement with a script
Script (name it anything, for example aaa.sh):
#!/bin/bash
ls $1 | while read line
do
    sed -i 's,\$HADOOP_HOME\$,/home/aa,g' $1$line
    echo $1$line
done
To run the script and replace in every file under /home/hadoop/test/:
./aaa.sh /home/hadoop/test/
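For instance (a hypothetical file content, only to show what the sed expression does), a line such as

export PATH=$HADOOP_HOME$/bin:$PATH

becomes

export PATH=/home/aa/bin:$PATH

after the script runs. A comma is used as the sed delimiter so the slashes in the replacement path do not clash with the usual s/// syntax.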
4 Run a command on all nodes at once
Script:
vi runRemoteCmd.sh
#!/bin/bash
$1
ssh -q hadoop@slave1 "$1"
ssh -q hadoop@slave2 "$1"
To run it:
./runRemoteCmd.sh "ls -l"
Implement a non-recursive binary search in Java.
public class BinarySearchClass
{
    public static int binary_search(int[] array, int value)
    {
        int beginIndex = 0;              // low index
        int endIndex = array.length - 1; // high index
        int midIndex = -1;
        while (beginIndex <= endIndex) {
            midIndex = beginIndex + (endIndex - beginIndex) / 2; // avoids overflow
            if (value == array[midIndex]) {
                return midIndex;
            } else if (value < array[midIndex]) {
                endIndex = midIndex - 1;
            } else {
                beginIndex = midIndex + 1;
            }
        }
        // Found: the index was returned above; not found: return -1.
        return -1;
    }

    // start  Note: unique marker used by the automatic grader; do not remove or add to it.
    public static void main(String[] args)
    {
        System.out.println("Start...");
        int[] myArray = new int[] { 1, 2, 3, 5, 6, 7, 8, 9 };
        System.out.println("Index of the number 8:");
        System.out.println(binary_search(myArray, 8));
    }
    // end  Note: unique marker used by the automatic grader; do not remove or add to it.
}
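A quick illustration of why the midpoint is computed as beginIndex + (endIndex - beginIndex) / 2 rather than (beginIndex + endIndex) / 2 (the index values below are made up, just large enough to trigger the problem):

int lo = 2000000000;
int hi = 2100000000;
int bad  = (lo + hi) / 2;      // lo + hi exceeds Integer.MAX_VALUE and wraps to a negative number
int good = lo + (hi - lo) / 2; // stays in range: 2050000000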
4.0 The current log sample format is:
a , b , c , d
b , b , f , e
a , a , c , f
Using the language you are most familiar with, write a MapReduce job that counts how many times each value in the fourth column occurs.
Answer:
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount1 {
    public static final String INPUT_PATH = "hdfs://hadoop0:9000/in";
    public static final String OUT_PATH = "hdfs://hadoop0:9000/out";

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fileSystem = FileSystem.get(conf);
        // Remove the output directory if it already exists.
        if (fileSystem.exists(new Path(OUT_PATH))) {
            fileSystem.delete(new Path(OUT_PATH), true);
        }
        Job job = new Job(conf, WordCount1.class.getSimpleName());
        job.setJarByClass(WordCount1.class);
        // 1. Read the input files and turn them into key/value pairs.
        FileInputFormat.setInputPaths(job, new Path(INPUT_PATH));
        // 2. Map: turn each input key/value pair into new key/value pairs.
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // 3. Partition the map output.
        // 4. Sort and group within each partition; values with the same key end up in one group.
        // 5. (Optionally) combine the grouped data locally.
        // 6. Shuffle: copy the map output to the reduce nodes over the network.
        // 7. Reduce: process the grouped data.
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
        job.waitForCompletion(true);
    }

    static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable k1, Text v1, Context context)
                throws IOException, InterruptedException {
            // Lines look like "a , b , c , d"; split on the comma and its surrounding spaces.
            String[] split = v1.toString().split("\\s*,\\s*");
            if (split.length >= 4) {
                // Emit the fourth column with a count of 1.
                context.write(new Text(split[3]), new LongWritable(1L));
            }
        }
    }

    static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text k2, Iterable<LongWritable> v2, Context context)
                throws IOException, InterruptedException {
            long count = 0L;
            for (LongWritable time : v2) {
                count += time.get();
            }
            context.write(k2, new LongWritable(count));
        }
    }
}
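For the three sample lines above, the fourth column contains d, e and f, once each, so the job's output would be:
d    1
e    1
f    1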
1 Implement the following logic with Hive or a custom MR job
product_no lac_id moment start_time user_id county_id staytime city_id
13429100031 22554 8 2013-03-11 08:55:19.151754088 571 571 282 571
13429100082 22540 8 2013-03-11 08:58:20.152622488 571 571 270 571
13429100082 22691 8 2013-03-11 08:56:37.149593624 571 571 103 571
13429100087 22705 8 2013-03-11 08:56:51.139539816 571 571 220 571
13429100087 22540 8 2013-03-11 08:55:45.150276800 571 571 66 571
13429100082 22540 8 2013-03-11 08:55:38.140225200 571 571 133 571
13429100140 26642 9 2013-03-11 09:02:19.151754088 571 571 18 571
13429100082 22691 8 2013-03-11 08:57:32.151754088 571 571 287 571
13429100189 22558 8 2013-03-11 08:56:24.139539816 571 571 48 571
13429100349 22503 8 2013-03-11 08:54:30.152622440 571 571 211 571
Field descriptions:
product_no: the user's phone number;
lac_id: the base station the user is on;
start_time: the time at which the user entered this base station's coverage;
staytime: how long the user stayed at this base station.
Requirement:
lac_id and start_time tell you where the user was at a given moment, and staytime tells you how long the user dwelled at each base station. Following the user's trajectory, merge the staytime of consecutive records at the same base station.
The final result is, for each user, the dwell time at each base station, ordered by time.
Example of the expected output:
13429100082 22540 8 2013-03-11 08:58:20.152622488 571 571 270 571
13429100082 22691 8 2013-03-11 08:56:37.149593624 571 571 390 571
13429100082 22540 8 2013-03-11 08:55:38.140225200 571 571 133 571
13429100087 22705 8 2013-03-11 08:56:51.139539816 571 571 220 571
13429100087 22540 8 2013-03-11 08:55:45.150276800 571 571 66 571
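A worked reading of the sample: user 13429100082 has two consecutive records at station 22691 (08:56:37 for 103 s and 08:57:32 for 287 s), and in the expected output they have been merged into a single record whose staytime is 103 + 287 = 390 s. Note that the answer below merges by extending the earlier record to the later record's end time, so for records whose intervals overlap the merged value can differ from a plain sum.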
Answer
package org.aboutyun;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;

public class TimeCount {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "time_count");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.waitForCompletion(true);
    }

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
        private Text id = new Text();
        private Text row = new Text();

        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] items = line.split("\t");
            if (items.length == 8) {
                // Skip the header row: staytime must be numeric.
                if (StringUtils.isNumeric(items[6])) {
                    // Group by (phone number, base station id).
                    id.set(items[0] + "\t" + items[1]);
                    row.set(line);
                    context.write(id, row);
                }
            } else {
                System.out.println("Wrong length: " + items.length);
            }
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        private static final SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        static {
            format.setLenient(false);
        }

        private Text rest = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Parse each row into a Record.
            ArrayList<Record> list = new ArrayList<Record>();
            for (Text row : values) {
                String[] items = row.toString().split("\t");
                try {
                    Record record = new Record();
                    record.items = items;
                    record.start_time = format.parse(items[3]).getTime();
                    record.stay_time = Long.parseLong(items[6]) * 1000;
                    list.add(record);
                } catch (ParseException e) {
                    e.printStackTrace();
                }
            }

            // Sort by start time.
            Collections.sort(list, new Comparator<Record>() {
                @Override
                public int compare(Record r1, Record r2) {
                    return r1.start_time < r2.start_time ? -1
                            : (r1.start_time > r2.start_time ? 1 : 0);
                }
            });

            // Find and merge overlapping slices.
            ArrayList<Record> result = new ArrayList<Record>();
            for (Record r1 : list) {
                boolean found = false;
                long r1_stop_time = r1.start_time + r1.stay_time;
                for (Record r2 : result) {
                    long r2_stop_time = r2.start_time + r2.stay_time;
                    if (r1.start_time > r2.start_time && r1.start_time <= r2_stop_time && r1_stop_time > r2_stop_time) {
                        // Extend the earlier record to cover the new slice.
                        r2.stay_time = r1_stop_time - r2.start_time;
                        found = true;
                    }
                }
                if (!found) {
                    result.add(r1);
                }
            }

            // Output one merged record per (phone, station) slice.
            for (Record r : result) {
                key.set(r.items[0]);
                String value = r.items[1] + "\t"
                        + r.items[2] + "\t"
                        + r.items[3] + "\t"
                        + r.items[4] + "\t"
                        + r.items[5] + "\t"
                        + (r.stay_time / 1000) + "\t"
                        + r.items[7];
                rest.set(value);
                context.write(key, rest);
            }
        }

        static class Record {
            String[] items;
            long start_time;
            long stay_time;
        }
    }
}
2 Linux scripting skills
2.1 In any scripting language you like, batch-replace $HADOOP_HOME$ with /home/ocetl/app/hadoop in every file under a specified directory.
2.2 Assume there are 10 hosts, H1 through H10, with passwordless SSH set up between them. Write one or more scripts that run a given command on all of the remote hosts.
For example: runRemoteCmd.sh "ls -l"
Expected result:
H1:
XXXXXXXX
XXXXXXXX
XXXXXXXX
H2:
XXXXXXXX
XXXXXXXX
XXXXXXXX
H3:
...
Answer
2.1 Use find together with sed:
find /home/ocetl/app/hadoop -type f -exec sed -i 's/\$HADOOP_HOME\$/\/home\/ocetl\/app\/hadoop/g' {} \;
2.2 Pass the command directly as an argument to ssh:
#!/bin/bash
if [ $# -ne 1 ]
then
    echo "Usage: `basename $0` {command}"
    exit
fi

for i in H1 H2 H3 H4 H5 H6 H7 H8 H9 H10
do
    echo "$i:"
    ssh $i "$1"
done
3 Hadoop fundamentals and problem-analysis skills
3.1 Describe where caching is used in Hadoop and what each cache is for.
3.2 Describe the problem reported in https://issues.apache.org/jira/browse/HDFS-2379 and the idea behind the eventual fix.
3.1 Not familiar with the details; HDFS does make use of caching.
3.2 The problem: when the disks are large but little memory is available for the page cache, generating the DataNode's block report takes a long time, and the FSVolumeSet lock is held for the whole duration, so all read and write operations are blocked and eventually time out. The issue proposes a way to generate the block report without holding the FSVolumeSet lock, so those operations no longer fail.
4 MapReduce development skills
Using wordcount as a reference, implement your own MapReduce job with the following requirements:
a Input file format:
xxx,xxx,xxx,xxx,xxx,xxx,xxx
b Output file format:
xxx,20
xxx,30
xxx,40
c Function: count how many times each keyword given on the command line appears in the input file, and print the result
For example: hadoop jar xxxxx.jar keywordcount xxx,xxx,xxx,xxx (four keywords)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.util.ArrayList;

public class WordCount {
    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        private final ArrayList<String> target_words = new ArrayList<String>();

        @Override
        protected void setup(Context context) {
            // Read the keyword list from the job configuration so that it is available
            // in every mapper JVM, not only in the client that submitted the job
            // (a static list filled in main() would not survive the job submission).
            for (String w : context.getConfiguration().get("keyword.list", "").split(",")) {
                if (!w.isEmpty()) {
                    target_words.add(w.toLowerCase());
                }
            }
        }

        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // The input format is comma separated; lower-case and split on commas/whitespace.
            String[] items = value.toString().toLowerCase().split("[,\\s]+");
            for (String item : items) {
                if (target_words.contains(item)) {
                    word.set(item);
                    context.write(word, one);
                }
            }
        }
    }

    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        if (args.length < 3) {
            System.out.println("Usage: wordcount <input_path> <output_path> <keyword_list>");
            return;
        }

        // Hand the comma-separated keyword list to the mappers via the configuration.
        conf.set("keyword.list", args[2]);

        Job job = new Job(conf, "wordcount");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.waitForCompletion(true);
    }
}
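A possible invocation matching the three arguments the program expects (the jar name, paths and keywords are only placeholders):

hadoop jar wordcount.jar WordCount /user/hadoop/input /user/hadoop/output apple,banana,orange,grape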
5 MapReduce optimization
Based on the program in the previous question, suggest ways to make the MR job run faster.
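One common first step for a counting job like the one above (a sketch, not part of the original answer): add a combiner so that per-keyword counts are partially summed on the map side and far less data has to cross the network during the shuffle. Because the reduce logic is a plain sum, the existing Reduce class can be reused directly:

job.setCombinerClass(Reduce.class);  // partial sums on the map side; valid because addition is associative and commutative

Other usual levers include compressing the map output and tuning the number of reduce tasks to the cluster size.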
6 Linux operating system knowledge
List the configuration files under /etc that you have modified, and explain what problem each change was meant to solve.
hosts: add hostname-to-IP mappings for machines on the LAN, so the IPs don't have to be remembered;
hostname: change the host name, often needed after cloning a virtual machine;
fstab: change mount points, needed when adding a new disk;
profile, bash.bashrc: commonly edited to change system-wide environment variables;
network/interfaces: needed when configuring a static IP.
7 Java development skills
7.1 Write code that counts the total number of lines in a 1 GB text file whose line separator is \x01\x02; take care to handle the edge cases.
7.2 Describe how you would analyse the performance of the program above during development and how you would go about optimizing it.
package org.aboutyun;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;

public class LineCounter {
    public static void main(String[] args) {
        try {
            BufferedReader reader = new BufferedReader(new FileReader(args[0]));
            char[] buffer = new char[4096];
            int count;
            char last = 0;
            long line_count = 0;
            while ((count = reader.read(buffer)) > 0) {
                if (line_count == 0) {
                    // The file contains something, so there is at least one line.
                    line_count = 1;
                }

                for (int i = 0; i < count; ++i) {
                    char c = buffer[i];
                    if (c == 0x02) {
                        if (i == 0 && last == 0x01) {
                            // The buffer boundary split the 0x01,0x02 pair.
                            ++line_count;
                        } else if (i > 0 && buffer[i - 1] == 0x01) {
                            // Normal case: the pair sits inside one buffer.
                            ++line_count;
                        }
                    }
                }

                // Remember the last character for the next buffer.
                last = buffer[count - 1];
            }
            reader.close();

            System.out.println(line_count);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}