Hadoop 单表关联 多表关联,hadoop单表关联
Hadoop 单表关联 多表关联,hadoop单表关联
在单表关联和多表关联的应用的时候,需要从数据里面挖掘出信息来进行操作。
例子给出 孩子和父母的表 输出孩子和爷爷的表 数据如下:
child | parent |
tom | jerem |
tom | lucy |
jerem | xd |
lucy | hongton |
jack | jc |
terry | jc |
jc | cy |
tom | xd |
tom | hongton |
jack | cy |
terry | cy |
首先在map阶段将输入数据分割成child和parent之后,需要将parent设为key、child设为value作为左表;而在另一组的child和parent中,需要将parent作为value、child作为key,此表作为右表。为了区分左右表,需要在输出value的时候附带上左右的区别信息:字符1代表左表,字符2代表右表。map阶段就完成了左表和右表的构造,在shuffle阶段完成连接,reduce接收连接结果,其中key对应的value-list包含了对应的关系;取出value值进行分析,将child放入数组1、parent放入数组2,然后对两个数组做笛卡尔积得出最后结果。
代码样式如下:
package reverseIndex;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Single-table self-join: from (child, parent) rows, derive (grandchild, grandparent) pairs.
 *
 * <p>The mapper emits every input row twice so the shuffle phase performs the join:
 * once keyed by parent with flag "1" (left table), and once keyed by child with
 * flag "2" (right table). In the reducer, for a given person P (the key), the "1"
 * records carry P's children (grandchild candidates) and the "2" records carry
 * P's parents (grandparent candidates); their cross product is the result.
 *
 * <p>Value layout produced by the mapper: {@code <flag>+<child>+<parent>}.
 */
public class singleJoin {

    // Used to emit the header row once before the first data row.
    // NOTE(review): with more than one reduce task, every output part gets its
    // own header line — acceptable for this demo, not for production use.
    public static int time = 0;

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split the line into at most two columns: child, parent.
            StringTokenizer itr = new StringTokenizer(value.toString());
            String[] values = new String[2];
            int i = 0;
            while (itr.hasMoreTokens() && i < 2) { // guard: ignore extra tokens
                values[i++] = itr.nextToken();
            }
            if (i < 2 || values[0].compareTo("child") == 0) {
                return; // skip malformed lines and the title row
            }
            String childName = values[0];
            String parentName = values[1];
            // Left table: keyed by parent, flag "1".
            context.write(new Text(parentName), new Text("1+" + childName + "+" + parentName));
            // Right table: keyed by child, flag "2".
            context.write(new Text(childName), new Text("2+" + childName + "+" + parentName));
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> value, Context context)
                throws IOException, InterruptedException {
            if (0 == time) {
                // Emit the header exactly once, before any data rows.
                context.write(new Text("grandchild"), new Text("grandparent"));
                time++;
            }
            int grandchildnum = 0;
            int grandparentnum = 0;
            String[] grandchild = new String[20];
            String[] grandparent = new String[20];
            Iterator<Text> it = value.iterator();
            while (it.hasNext()) {
                String record = it.next().toString();
                if (record.isEmpty()) {
                    continue;
                }
                // Record layout: <flag>+<child>+<parent>.
                String[] parts = record.split("\\+", 3);
                if (parts.length < 3) {
                    continue; // malformed value, skip
                }
                if ("1".equals(parts[0])) {
                    // Key is a parent: parts[1] is one of its children → grandchild candidate.
                    grandchild[grandchildnum++] = parts[1];
                } else if ("2".equals(parts[0])) {
                    // Key is a child: parts[2] is one of its parents → grandparent candidate.
                    grandparent[grandparentnum++] = parts[2];
                }
            }
            if (0 != grandchildnum && 0 != grandparentnum) {
                for (int m = 0; m < grandchildnum; m++) {
                    for (int n = 0; n < grandparentnum; n++) {
                        // BUG FIX: original wrote grandparent[m] here; the inner
                        // index n is the correct one for the Cartesian product.
                        context.write(new Text(grandchild[m]), new Text(grandparent[n]));
                    }
                }
            }
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "singleJoin");
        job.setJarByClass(singleJoin.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
多表连接:输入2个文件,一个是学生表,包含学生名和课程编号;另一个是教师名和课程编号,输出学生相对应的教师。
1 | cy |
2 | ld |
3 | cy |
4 | ld |
xd | 1 |
xyy | 2 |
lf | 3 |
zz | 3 |
xd | cy |
xyy | ld |
lf | cy |
zz | cy |
解决方法与上面一致:(左右表)
代码如下:
package reverseIndex;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Multi-table join: joins a (studentName, courseId) file with a
 * (courseId, teacherName) file on courseId, producing (student, teacher) pairs.
 *
 * <p>The mapper keys every record by the course id (the token that starts with a
 * digit) and flags the value "1" for a student row (name appears before the id)
 * or "2" for a teacher row (id appears first). The reducer collects both sides
 * for each course id and writes their cross product.
 *
 * <p>Value layout produced by the mapper: {@code <flag>+<name> } (note the
 * trailing space, preserved from the original output format).
 */
public class multiJoin {

    // Used to emit the header row once; see note in singleJoin about multiple
    // reduce tasks each producing their own header.
    public static int time = 0;

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            if (line.contains("studentName") || line.contains("courseId")) {
                return; // skip the header line of either input file
            }
            String mapKey = "";
            String relationType = "";
            StringBuilder mapValue = new StringBuilder();
            int i = 0; // number of name tokens seen so far
            StringTokenizer it = new StringTokenizer(line);
            while (it.hasMoreTokens()) {
                String token = it.nextToken();
                // A token starting with a digit is the course id — the join key.
                if (token.charAt(0) >= '0' && token.charAt(0) <= '9') {
                    mapKey = token;
                    // Name seen before the id → student row (left table, "1");
                    // id first → teacher row (right table, "2").
                    relationType = (i > 0) ? "1" : "2";
                    continue;
                }
                mapValue.append(token).append(" "); // trailing space kept for output compatibility
                i++;
            }
            context.write(new Text(mapKey), new Text(relationType + "+" + mapValue));
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            if (time == 0) {
                // Emit the header exactly once, before any data rows.
                context.write(new Text("student"), new Text("teacher"));
                time++;
            }
            int studentnum = 0;
            int teachernum = 0;
            String[] student = new String[10];
            String[] teacher = new String[10];
            Iterator<Text> it = values.iterator();
            while (it.hasNext()) {
                String record = it.next().toString();
                if (record.isEmpty()) {
                    // BUG FIX: the original used `return` here, which aborted the
                    // entire reduce call on the first empty value and silently
                    // dropped the join output for this key. Skip only this record.
                    continue;
                }
                char relationType = record.charAt(0);
                // Offset 2 skips the flag character and the '+' separator.
                if ('1' == relationType) {
                    student[studentnum++] = record.substring(2);
                } else if ('2' == relationType) {
                    teacher[teachernum++] = record.substring(2);
                }
            }
            if (0 != studentnum && 0 != teachernum) {
                for (int m = 0; m < studentnum; m++) {
                    for (int n = 0; n < teachernum; n++) {
                        context.write(new Text(student[m]), new Text(teacher[n]));
                    }
                }
            }
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "multiJoin");
        job.setJarByClass(multiJoin.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
呵呵,你自己都说了啊,第一个是关联的两个表,第二个就插入一个表。
你的表数据量大,关联会很慢,不过感觉应该也不会有这么长时间吧。。。
你这样,用查询分析器,,分析一下语句效率,看看什么地方构成表扫描了。表扫描会使效率很低。
第二种也没有什么弊端,不过效率当然不是最好的。
您好,请将字段写全好吗?
你写成这样简单,怎么可能知道怎么做。。
在这里我只写两个的用餐人数哈,大概的方法就是:
select 日期,sum(早餐) as 早餐用餐人数,sum(午餐) as 午餐用餐人数,sum(晚餐) as 晚餐用餐人数 from 用餐表 where 日期>=第一个日期 and 日期<=第二个日期 group by 日期;
大概的做法就是这样啦~如果要统计早餐到晚餐
评论暂时关闭