Hadoop 单表关联 多表关联,hadoop单表关联
Hadoop 单表关联 多表关联,hadoop单表关联
在单表关联和多表关联的应用的时候,需要从数据里面挖掘出信息来进行操作。
例子给出 孩子和父母的表 输出孩子和爷爷的表 数据如下:
child | parent |
tom | jerem |
tom | lucy |
jerem | xd |
lucy | hongton |
jack | jc |
terry | jc |
jc | cy |
tom | xd |
tom | hongton |
jack | cy |
terry | cy |
首先在map阶段将输入数据分割成child和parent之后,需要将parent设为key、child设为value作为左表;而在另一组的child和parent中,需要将parent作为value、child作为key,此表作为右表。为了区分左右表,需要在输出value的时候附带上左右的区别信息:字符1代表左表,字符2代表右表。map阶段就完成了左表和右表的构造,在shuffle阶段完成连接,reduce接收连接结果,其中key对应的value-list包含了对应的关系;取出value值进行分析,将child放入数组1、parent放入数组2,然后对两个数组做笛卡尔积得出最后结果。
代码样式如下:
package reverseIndex;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Single-table self-join: from (child, parent) rows, derive (grandchild, grandparent) pairs.
 *
 * <p>The mapper emits every input row twice so the shuffle phase performs the join:
 * once keyed by parent with flag "1" (left table), and once keyed by child with
 * flag "2" (right table). In the reducer, for a given person P (the key), the "1"
 * records carry P's children (grandchild candidates) and the "2" records carry
 * P's parents (grandparent candidates); their cross product is the result.
 *
 * <p>Value layout produced by the mapper: {@code <flag>+<child>+<parent>}.
 */
public class singleJoin {

    // Used to emit the header row once before the first data row.
    // NOTE(review): with more than one reduce task, every output part gets its
    // own header line — acceptable for this demo, not for production use.
    public static int time = 0;

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split the line into at most two columns: child, parent.
            StringTokenizer itr = new StringTokenizer(value.toString());
            String[] values = new String[2];
            int i = 0;
            while (itr.hasMoreTokens() && i < 2) { // guard: ignore extra tokens
                values[i++] = itr.nextToken();
            }
            if (i < 2 || values[0].compareTo("child") == 0) {
                return; // skip malformed lines and the title row
            }
            String childName = values[0];
            String parentName = values[1];
            // Left table: keyed by parent, flag "1".
            context.write(new Text(parentName), new Text("1+" + childName + "+" + parentName));
            // Right table: keyed by child, flag "2".
            context.write(new Text(childName), new Text("2+" + childName + "+" + parentName));
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> value, Context context)
                throws IOException, InterruptedException {
            if (0 == time) {
                // Emit the header exactly once, before any data rows.
                context.write(new Text("grandchild"), new Text("grandparent"));
                time++;
            }
            int grandchildnum = 0;
            int grandparentnum = 0;
            String[] grandchild = new String[20];
            String[] grandparent = new String[20];
            Iterator<Text> it = value.iterator();
            while (it.hasNext()) {
                String record = it.next().toString();
                if (record.isEmpty()) {
                    continue;
                }
                // Record layout: <flag>+<child>+<parent>.
                String[] parts = record.split("\\+", 3);
                if (parts.length < 3) {
                    continue; // malformed value, skip
                }
                if ("1".equals(parts[0])) {
                    // Key is a parent: parts[1] is one of its children → grandchild candidate.
                    grandchild[grandchildnum++] = parts[1];
                } else if ("2".equals(parts[0])) {
                    // Key is a child: parts[2] is one of its parents → grandparent candidate.
                    grandparent[grandparentnum++] = parts[2];
                }
            }
            if (0 != grandchildnum && 0 != grandparentnum) {
                for (int m = 0; m < grandchildnum; m++) {
                    for (int n = 0; n < grandparentnum; n++) {
                        // BUG FIX: original wrote grandparent[m] here; the inner
                        // index n is the correct one for the Cartesian product.
                        context.write(new Text(grandchild[m]), new Text(grandparent[n]));
                    }
                }
            }
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "singleJoin");
        job.setJarByClass(singleJoin.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
多表连接:输入2个文件,一个是学生表,包含学生名和课程编号;另一个是教师名和课程编号,输出学生相对应的教师。
1 | cy |
2 | ld |
3 | cy |
4 | ld |
xd | 1 |
xyy | 2 |
lf | 3 |
zz | 3 |
xd | cy |
xyy | ld |
lf | cy |
zz | cy |
解决方法与上面一致:(左右表)
代码如下:
package reverseIndex;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Multi-table join: joins a (studentName, courseId) file with a
 * (courseId, teacherName) file on courseId, producing (student, teacher) pairs.
 *
 * <p>The mapper keys every record by the course id (the token that starts with a
 * digit) and flags the value "1" for a student row (name appears before the id)
 * or "2" for a teacher row (id appears first). The reducer collects both sides
 * for each course id and writes their cross product.
 *
 * <p>Value layout produced by the mapper: {@code <flag>+<name> } (note the
 * trailing space, preserved from the original output format).
 */
public class multiJoin {

    // Used to emit the header row once; see note in singleJoin about multiple
    // reduce tasks each producing their own header.
    public static int time = 0;

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            if (line.contains("studentName") || line.contains("courseId")) {
                return; // skip the header line of either input file
            }
            String mapKey = "";
            String relationType = "";
            StringBuilder mapValue = new StringBuilder();
            int i = 0; // number of name tokens seen so far
            StringTokenizer it = new StringTokenizer(line);
            while (it.hasMoreTokens()) {
                String token = it.nextToken();
                // A token starting with a digit is the course id — the join key.
                if (token.charAt(0) >= '0' && token.charAt(0) <= '9') {
                    mapKey = token;
                    // Name seen before the id → student row (left table, "1");
                    // id first → teacher row (right table, "2").
                    relationType = (i > 0) ? "1" : "2";
                    continue;
                }
                mapValue.append(token).append(" "); // trailing space kept for output compatibility
                i++;
            }
            context.write(new Text(mapKey), new Text(relationType + "+" + mapValue));
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            if (time == 0) {
                // Emit the header exactly once, before any data rows.
                context.write(new Text("student"), new Text("teacher"));
                time++;
            }
            int studentnum = 0;
            int teachernum = 0;
            String[] student = new String[10];
            String[] teacher = new String[10];
            Iterator<Text> it = values.iterator();
            while (it.hasNext()) {
                String record = it.next().toString();
                if (record.isEmpty()) {
                    // BUG FIX: the original used `return` here, which aborted the
                    // entire reduce call on the first empty value and silently
                    // dropped the join output for this key. Skip only this record.
                    continue;
                }
                char relationType = record.charAt(0);
                // Offset 2 skips the flag character and the '+' separator.
                if ('1' == relationType) {
                    student[studentnum++] = record.substring(2);
                } else if ('2' == relationType) {
                    teacher[teachernum++] = record.substring(2);
                }
            }
            if (0 != studentnum && 0 != teachernum) {
                for (int m = 0; m < studentnum; m++) {
                    for (int n = 0; n < teachernum; n++) {
                        context.write(new Text(student[m]), new Text(teacher[n]));
                    }
                }
            }
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "multiJoin");
        job.setJarByClass(multiJoin.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
呵呵,你自己都说了啊,第一个是关联的两个表,第二个就插入一个表。
你的表数据量大,关联会很慢,不过感觉应该也不会有这么长时间吧。。。
你这样,用查询分析器,,分析一下语句效率,看看什么地方构成表扫描了。表扫描会使效率很低。
第二种也没有什么弊端,不过效率当然不是最好的。
您好,请将字段写全好吗?
你写成这样简单,怎么可能知道怎么做。。
在这里我只写两个的用餐人数哈,大概的方法就是:
select 日期,sum(早餐) as 早餐用餐人数,sum(午餐) as 午餐用餐人数,sum(晚餐) as 晚餐用餐人数 from 用餐表 where 日期>=第一个日期 and 日期<=第二个日期 group by 日期;
大概的做法就是这样啦~如果要统计早餐到晚餐
评论暂时关闭