A Hadoop interview question


I found this question online; if my solution is wrong, please point it out.

1. Use Hive or a custom MapReduce job to implement the following logic.

Input data:

product_no      lac_id  moment  start_time      user_id county_id       staytime        city_id
13429100031     22554   8       2013-03-11 08:55:19.151754088   571     571     282     571
13429100082     22540   8       2013-03-11 08:58:20.152622488   571     571     270     571
13429100082     22691   8       2013-03-11 08:56:37.149593624   571     571     103     571
13429100087     22705   8       2013-03-11 08:56:51.139539816   571     571     220     571
13429100087     22540   8       2013-03-11 08:55:45.150276800   571     571     66      571
13429100082     22540   8       2013-03-11 08:55:38.140225200   571     571     133     571
13429100140     26642   9       2013-03-11 09:02:19.151754088   571     571     18      571
13429100082     22691   8       2013-03-11 08:57:32.151754088   571     571     287     571
13429100189     22558   8       2013-03-11 08:56:24.139539816   571     571     48      571
13429100349     22503   8       2013-03-11 08:54:30.152622440   571     571     211     571

Field descriptions: product_no: the user's phone number; lac_id: the base station the user is on; start_time: the time the user entered this base station; staytime: how long the user stayed at this base station.

Requirements: lac_id and start_time tell us where the user was at a given moment, and staytime tells us how long the user stayed at each base station. Along a user's trajectory, merge the staytime of consecutive records on the same base station, so that the final result gives, for each user, the dwell time at each base station ordered by time.

Example of the expected output (the merge step is sketched right after this example):

13429100082     22540   8       2013-03-11 08:58:20.152622488   571     571     270     571
13429100082     22691   8       2013-03-11 08:56:37.149593624   571     571     390     571
13429100082     22540   8       2013-03-11 08:55:38.140225200   571     571     133     571
13429100087     22705   8       2013-03-11 08:56:51.139539816   571     571     220     571
13429100087     22540   8       2013-03-11 08:55:45.150276800   571     571     66      571
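To make the merge step concrete: in the expected output, the two consecutive records of 13429100082 on base station 22691 collapse into one record with staytime 103 + 287 = 390. Below is a minimal, illustrative sketch of such a merge, assuming one user's records are already sorted ascending by start_time and split with the same delimiter used in the MR code further down; the class and helper name (MergeSketch, mergeConsecutive) are made up for this sketch and are not part of the job below. Reversing the merged list afterwards gives the newest-first order shown in the expected output.

import java.util.ArrayList;
import java.util.List;

public class MergeSketch
{
	// Merge consecutive records with the same lac_id by summing their staytime.
	// Input: one user's records, sorted ascending by start_time.
	// Token positions after split: 0 product_no, 1 lac_id, 2 moment,
	// 3 date, 4 time, 5 user_id, 6 county_id, 7 staytime, 8 city_id.
	public static List<String> mergeConsecutive(List<String> sorted)
	{
		List<String> merged = new ArrayList<String>();
		String[] prev = null;
		for (String line : sorted)
		{
			String[] cur = line.split(" +|\t");
			if (prev != null && prev[1].equals(cur[1]))
			{
				// Same base station as the previous record: accumulate staytime,
				// keeping the earlier record's start_time.
				prev[7] = String.valueOf(Integer.parseInt(prev[7]) + Integer.parseInt(cur[7]));
			}
			else
			{
				if (prev != null)
				{
					merged.add(String.join("\t", prev));
				}
				prev = cur;
			}
		}
		if (prev != null)
		{
			merged.add(String.join("\t", prev));
		}
		return merged;
	}
}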

My approach: read the input with TextInputFormat; in the map function, split each line and emit the phone number as the output key and the whole line as the output value. In the reduce function, sort each user's records by time before writing them out.

package hadoop;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class HadoopTest1 
{
	public static String split = " +|\t";  // field delimiter: one or more spaces, or a tab
	
	// Records are not compared as whole strings; compare by start_time (date + time fields) instead.
	// Concatenating the fixed-width date and time strings gives a lexicographic order that matches chronological order.
	public static class MyComparator implements Comparator<String>
	{
		@Override
		public int compare(String str1, String str2)
		{
			String[] arr1 = str1.split(split);
			String[] arr2 = str2.split(split);
			
			return (arr1[3] + arr1[4]).compareTo(arr2[3] + arr2[4]);
		}
	}
	
	public static class MyMapper extends Mapper<LongWritable, Text, Text, Text>
	{
		@Override
		public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
		{
			if (key.get() == 0)  // skip the header line (byte offset 0)
			{
				return;
			}
			String line = value.toString();
			String[] elements = line.split(split);
			context.write(new Text(elements[0]), value);  // key: phone number, value: the whole record
		}
	}
	public static class MyReducer extends Reducer<Text, Text, NullWritable, Text>
	{
		@Override
		public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException
		{
			List<String> list = new ArrayList<String>();
			
			for (Text v : values)
			{
				list.add(v.toString());
			}
			
			// Sort this user's records by start_time, then reverse so the newest record comes first.
			Collections.sort(list, new MyComparator());
			Collections.reverse(list);
			
			for (int i = 0; i < list.size(); ++i)
			{
				context.write(NullWritable.get(), new Text(list.get(i)));
			}
		}
	}
	
	public static void main(String[] args)
	{
		String HDFS_PATH = "hdfs://master:9000";
		String INPUT_PATH = "/home/hadoop/hadoop-data/20150721/input";
		String OUTPUT_PATH = "/home/hadoop/hadoop-data/20150721/output";
		
		try
		{
			FileSystem fs = FileSystem.get(new URI(HDFS_PATH), new Configuration());
			FSDataOutputStream out = fs.create(new Path(HDFS_PATH + INPUT_PATH + "/text"));
			String text = "product_no    lac_id  moment            start_time          user_id  county_id  staytime  city_id\n"
						+ "13429100031     22554   8       2013-03-11 08:55:19.151754088   571     571     282     571\n"
						+ "13429100082     22540   8       2013-03-11 08:58:20.152622488   571     571     270     571\n"
						+ "13429100082     22691   8       2013-03-11 08:56:37.149593624   571     571     103     571\n"
						+ "13429100087     22705   8       2013-03-11 08:56:51.139539816   571     571     220     571\n"
						+ "13429100087     22540   8       2013-03-11 08:55:45.150276800   571     571     66      571\n"
						+ "13429100082     22540   8       2013-03-11 08:55:38.140225200   571     571     133     571\n"
						+ "13429100140     26642   9       2013-03-11 09:02:19.151754088   571     571     18      571\n"
						+ "13429100082     22691   8       2013-03-11 08:57:32.151754088   571     571     287     571\n"
						+ "13429100189     22558   8       2013-03-11 08:56:24.139539816   571     571     48      571\n"
						+ "13429100349     22503   8       2013-03-11 08:54:30.152622440   571     571     211     571";
			out.write(text.getBytes());
			out.close();
			
			Job job = Job.getInstance(new Configuration(), "HadoopTest1");
			job.setJarByClass(HadoopTest1.class);
			
			job.setMapperClass(MyMapper.class);
			job.setReducerClass(MyReducer.class);
			
			job.setMapOutputKeyClass(Text.class);
			job.setMapOutputValueClass(Text.class);
			
			job.setOutputKeyClass(NullWritable.class);
			job.setOutputValueClass(Text.class);
					
			if (fs.exists(new Path(HDFS_PATH + OUTPUT_PATH)))  // delete the output directory if it already exists
			{
				fs.delete(new Path(HDFS_PATH + OUTPUT_PATH), true);
			}

			TextInputFormat.addInputPath(job, new Path(HDFS_PATH + INPUT_PATH));
			FileOutputFormat.setOutputPath(job, new Path(HDFS_PATH + OUTPUT_PATH));
			
			job.waitForCompletion(true);
			
		}
		catch (URISyntaxException e)
		{
			e.printStackTrace();
		}
		catch (IOException e)
		{
			e.printStackTrace();
		}
		catch (ClassNotFoundException e)
		{
			e.printStackTrace();
		}
		catch (InterruptedException e)
		{
			e.printStackTrace();
		}
	}
}
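For reference, assuming the class is packaged into a jar (the jar name below is just a placeholder), the job can be submitted in the usual way; main() writes the sample input into HDFS by itself, so no separate upload is needed:

hadoop jar hadooptest.jar hadoop.HadoopTest1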

Final output of the job:

13429100031     22554   8       2013-03-11 08:55:19.151754088   571     571     282     571
13429100082     22540   8       2013-03-11 08:58:20.152622488   571     571     270     571
13429100082     22691   8       2013-03-11 08:57:32.151754088   571     571     287     571
13429100082     22691   8       2013-03-11 08:56:37.149593624   571     571     103     571
13429100082     22540   8       2013-03-11 08:55:38.140225200   571     571     133     571
13429100087     22705   8       2013-03-11 08:56:51.139539816   571     571     220     571
13429100087     22540   8       2013-03-11 08:55:45.150276800   571     571     66      571
13429100140     26642   9       2013-03-11 09:02:19.151754088   571     571     18      571
13429100189     22558   8       2013-03-11 08:56:24.139539816   571     571     48      571
13429100349     22503   8       2013-03-11 08:54:30.152622440   571     571     211     571

If anything here is wrong, please let me know.


Copyright notice: this is an original post by the author; please do not repost without permission.
