1 | /** |
---|
2 | * Licensed to the Apache Software Foundation (ASF) under one |
---|
3 | * or more contributor license agreements. See the NOTICE file |
---|
4 | * distributed with this work for additional information |
---|
5 | * regarding copyright ownership. The ASF licenses this file |
---|
6 | * to you under the Apache License, Version 2.0 (the |
---|
7 | * "License"); you may not use this file except in compliance |
---|
8 | * with the License. You may obtain a copy of the License at |
---|
9 | * |
---|
10 | * http://www.apache.org/licenses/LICENSE-2.0 |
---|
11 | * |
---|
12 | * Unless required by applicable law or agreed to in writing, software |
---|
13 | * distributed under the License is distributed on an "AS IS" BASIS, |
---|
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
---|
15 | * See the License for the specific language governing permissions and |
---|
16 | * limitations under the License. |
---|
17 | */ |
---|
18 | |
---|
19 | package org.apache.hadoop.mapred.join; |
---|
20 | |
---|
21 | import java.io.IOException; |
---|
22 | import java.util.ArrayList; |
---|
23 | import java.util.Map; |
---|
24 | import java.util.regex.Matcher; |
---|
25 | import java.util.regex.Pattern; |
---|
26 | |
---|
27 | import org.apache.hadoop.fs.Path; |
---|
28 | import org.apache.hadoop.io.WritableComparable; |
---|
29 | import org.apache.hadoop.mapred.InputFormat; |
---|
30 | import org.apache.hadoop.mapred.InputSplit; |
---|
31 | import org.apache.hadoop.mapred.JobConf; |
---|
32 | import org.apache.hadoop.mapred.Reporter; |
---|
33 | |
---|
34 | /** |
---|
35 | * An InputFormat capable of performing joins over a set of data sources sorted |
---|
36 | * and partitioned the same way. |
---|
37 | * @see #setFormat |
---|
38 | * |
---|
39 | * A user may define new join types by setting the property |
---|
40 | * <tt>mapred.join.define.<ident></tt> to a classname. In the expression |
---|
41 | * <tt>mapred.join.expr</tt>, the identifier will be assumed to be a |
---|
42 | * ComposableRecordReader. |
---|
43 | * <tt>mapred.join.keycomparator</tt> can be a classname used to compare keys |
---|
44 | * in the join. |
---|
45 | * @see JoinRecordReader |
---|
46 | * @see MultiFilterRecordReader |
---|
47 | */ |
---|
48 | public class CompositeInputFormat<K extends WritableComparable> |
---|
49 | implements ComposableInputFormat<K,TupleWritable> { |
---|
50 | |
---|
51 | // expression parse tree to which IF requests are proxied |
---|
52 | private Parser.Node root; |
---|
53 | |
---|
54 | public CompositeInputFormat() { } |
---|
55 | |
---|
56 | |
---|
57 | /** |
---|
58 | * Interpret a given string as a composite expression. |
---|
59 | * {@code |
---|
60 | * func ::= <ident>([<func>,]*<func>) |
---|
61 | * func ::= tbl(<class>,"<path>") |
---|
62 | * class ::= @see java.lang.Class#forName(java.lang.String) |
---|
63 | * path ::= @see org.apache.hadoop.fs.Path#Path(java.lang.String) |
---|
64 | * } |
---|
65 | * Reads expression from the <tt>mapred.join.expr</tt> property and |
---|
66 | * user-supplied join types from <tt>mapred.join.define.<ident></tt> |
---|
67 | * types. Paths supplied to <tt>tbl</tt> are given as input paths to the |
---|
68 | * InputFormat class listed. |
---|
69 | * @see #compose(java.lang.String, java.lang.Class, java.lang.String...) |
---|
70 | */ |
---|
71 | public void setFormat(JobConf job) throws IOException { |
---|
72 | addDefaults(); |
---|
73 | addUserIdentifiers(job); |
---|
74 | root = Parser.parse(job.get("mapred.join.expr", null), job); |
---|
75 | } |
---|
76 | |
---|
77 | /** |
---|
78 | * Adds the default set of identifiers to the parser. |
---|
79 | */ |
---|
80 | protected void addDefaults() { |
---|
81 | try { |
---|
82 | Parser.CNode.addIdentifier("inner", InnerJoinRecordReader.class); |
---|
83 | Parser.CNode.addIdentifier("outer", OuterJoinRecordReader.class); |
---|
84 | Parser.CNode.addIdentifier("override", OverrideRecordReader.class); |
---|
85 | Parser.WNode.addIdentifier("tbl", WrappedRecordReader.class); |
---|
86 | } catch (NoSuchMethodException e) { |
---|
87 | throw new RuntimeException("FATAL: Failed to init defaults", e); |
---|
88 | } |
---|
89 | } |
---|
90 | |
---|
91 | /** |
---|
92 | * Inform the parser of user-defined types. |
---|
93 | */ |
---|
94 | private void addUserIdentifiers(JobConf job) throws IOException { |
---|
95 | Pattern x = Pattern.compile("^mapred\\.join\\.define\\.(\\w+)$"); |
---|
96 | for (Map.Entry<String,String> kv : job) { |
---|
97 | Matcher m = x.matcher(kv.getKey()); |
---|
98 | if (m.matches()) { |
---|
99 | try { |
---|
100 | Parser.CNode.addIdentifier(m.group(1), |
---|
101 | job.getClass(m.group(0), null, ComposableRecordReader.class)); |
---|
102 | } catch (NoSuchMethodException e) { |
---|
103 | throw (IOException)new IOException( |
---|
104 | "Invalid define for " + m.group(1)).initCause(e); |
---|
105 | } |
---|
106 | } |
---|
107 | } |
---|
108 | } |
---|
109 | |
---|
110 | /** |
---|
111 | * Build a CompositeInputSplit from the child InputFormats by assigning the |
---|
112 | * ith split from each child to the ith composite split. |
---|
113 | */ |
---|
114 | public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { |
---|
115 | setFormat(job); |
---|
116 | job.setLong("mapred.min.split.size", Long.MAX_VALUE); |
---|
117 | return root.getSplits(job, numSplits); |
---|
118 | } |
---|
119 | |
---|
120 | /** |
---|
121 | * Construct a CompositeRecordReader for the children of this InputFormat |
---|
122 | * as defined in the init expression. |
---|
123 | * The outermost join need only be composable, not necessarily a composite. |
---|
124 | * Mandating TupleWritable isn't strictly correct. |
---|
125 | */ |
---|
126 | @SuppressWarnings("unchecked") // child types unknown |
---|
127 | public ComposableRecordReader<K,TupleWritable> getRecordReader( |
---|
128 | InputSplit split, JobConf job, Reporter reporter) throws IOException { |
---|
129 | setFormat(job); |
---|
130 | return root.getRecordReader(split, job, reporter); |
---|
131 | } |
---|
132 | |
---|
133 | /** |
---|
134 | * Convenience method for constructing composite formats. |
---|
135 | * Given InputFormat class (inf), path (p) return: |
---|
136 | * {@code tbl(<inf>, <p>) } |
---|
137 | */ |
---|
138 | public static String compose(Class<? extends InputFormat> inf, String path) { |
---|
139 | return compose(inf.getName().intern(), path, new StringBuffer()).toString(); |
---|
140 | } |
---|
141 | |
---|
142 | /** |
---|
143 | * Convenience method for constructing composite formats. |
---|
144 | * Given operation (op), Object class (inf), set of paths (p) return: |
---|
145 | * {@code <op>(tbl(<inf>,<p1>),tbl(<inf>,<p2>),...,tbl(<inf>,<pn>)) } |
---|
146 | */ |
---|
147 | public static String compose(String op, Class<? extends InputFormat> inf, |
---|
148 | String... path) { |
---|
149 | final String infname = inf.getName(); |
---|
150 | StringBuffer ret = new StringBuffer(op + '('); |
---|
151 | for (String p : path) { |
---|
152 | compose(infname, p, ret); |
---|
153 | ret.append(','); |
---|
154 | } |
---|
155 | ret.setCharAt(ret.length() - 1, ')'); |
---|
156 | return ret.toString(); |
---|
157 | } |
---|
158 | |
---|
159 | /** |
---|
160 | * Convenience method for constructing composite formats. |
---|
161 | * Given operation (op), Object class (inf), set of paths (p) return: |
---|
162 | * {@code <op>(tbl(<inf>,<p1>),tbl(<inf>,<p2>),...,tbl(<inf>,<pn>)) } |
---|
163 | */ |
---|
164 | public static String compose(String op, Class<? extends InputFormat> inf, |
---|
165 | Path... path) { |
---|
166 | ArrayList<String> tmp = new ArrayList<String>(path.length); |
---|
167 | for (Path p : path) { |
---|
168 | tmp.add(p.toString()); |
---|
169 | } |
---|
170 | return compose(op, inf, tmp.toArray(new String[0])); |
---|
171 | } |
---|
172 | |
---|
173 | private static StringBuffer compose(String inf, String path, |
---|
174 | StringBuffer sb) { |
---|
175 | sb.append("tbl(" + inf + ",\""); |
---|
176 | sb.append(path); |
---|
177 | sb.append("\")"); |
---|
178 | return sb; |
---|
179 | } |
---|
180 | |
---|
181 | } |
---|