1 | /** |
---|
2 | * Licensed to the Apache Software Foundation (ASF) under one |
---|
3 | * or more contributor license agreements. See the NOTICE file |
---|
4 | * distributed with this work for additional information |
---|
5 | * regarding copyright ownership. The ASF licenses this file |
---|
6 | * to you under the Apache License, Version 2.0 (the |
---|
7 | * "License"); you may not use this file except in compliance |
---|
8 | * with the License. You may obtain a copy of the License at |
---|
9 | * |
---|
10 | * http://www.apache.org/licenses/LICENSE-2.0 |
---|
11 | * |
---|
12 | * Unless required by applicable law or agreed to in writing, software |
---|
13 | * distributed under the License is distributed on an "AS IS" BASIS, |
---|
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
---|
15 | * See the License for the specific language governing permissions and |
---|
16 | * limitations under the License. |
---|
17 | */ |
---|
18 | |
---|
19 | package org.apache.hadoop.record; |
---|
20 | |
---|
21 | import java.io.DataInput; |
---|
22 | import java.io.DataOutput; |
---|
23 | import java.io.IOException; |
---|
24 | import org.apache.hadoop.io.WritableComparator; |
---|
25 | import org.apache.hadoop.io.WritableUtils; |
---|
26 | |
---|
27 | /** |
---|
28 | * Various utility functions for Hadooop record I/O runtime. |
---|
29 | */ |
---|
30 | public class Utils { |
---|
31 | |
---|
32 | /** Cannot create a new instance of Utils */ |
---|
33 | private Utils() { |
---|
34 | } |
---|
35 | |
---|
36 | public static final char[] hexchars = { '0', '1', '2', '3', '4', '5', |
---|
37 | '6', '7', '8', '9', 'A', 'B', |
---|
38 | 'C', 'D', 'E', 'F' }; |
---|
39 | /** |
---|
40 | * |
---|
41 | * @param s |
---|
42 | * @return |
---|
43 | */ |
---|
44 | static String toXMLString(String s) { |
---|
45 | StringBuffer sb = new StringBuffer(); |
---|
46 | for (int idx = 0; idx < s.length(); idx++) { |
---|
47 | char ch = s.charAt(idx); |
---|
48 | if (ch == '<') { |
---|
49 | sb.append("<"); |
---|
50 | } else if (ch == '&') { |
---|
51 | sb.append("&"); |
---|
52 | } else if (ch == '%') { |
---|
53 | sb.append("%0025"); |
---|
54 | } else if (ch < 0x20 || |
---|
55 | (ch > 0xD7FF && ch < 0xE000) || |
---|
56 | (ch > 0xFFFD)) { |
---|
57 | sb.append("%"); |
---|
58 | sb.append(hexchars[(ch & 0xF000) >> 12]); |
---|
59 | sb.append(hexchars[(ch & 0x0F00) >> 8]); |
---|
60 | sb.append(hexchars[(ch & 0x00F0) >> 4]); |
---|
61 | sb.append(hexchars[(ch & 0x000F)]); |
---|
62 | } else { |
---|
63 | sb.append(ch); |
---|
64 | } |
---|
65 | } |
---|
66 | return sb.toString(); |
---|
67 | } |
---|
68 | |
---|
69 | static private int h2c(char ch) { |
---|
70 | if (ch >= '0' && ch <= '9') { |
---|
71 | return ch - '0'; |
---|
72 | } else if (ch >= 'A' && ch <= 'F') { |
---|
73 | return ch - 'A' + 10; |
---|
74 | } else if (ch >= 'a' && ch <= 'f') { |
---|
75 | return ch - 'a' + 10; |
---|
76 | } |
---|
77 | return 0; |
---|
78 | } |
---|
79 | |
---|
80 | /** |
---|
81 | * |
---|
82 | * @param s |
---|
83 | * @return |
---|
84 | */ |
---|
85 | static String fromXMLString(String s) { |
---|
86 | StringBuffer sb = new StringBuffer(); |
---|
87 | for (int idx = 0; idx < s.length();) { |
---|
88 | char ch = s.charAt(idx++); |
---|
89 | if (ch == '%') { |
---|
90 | int ch1 = h2c(s.charAt(idx++)) << 12; |
---|
91 | int ch2 = h2c(s.charAt(idx++)) << 8; |
---|
92 | int ch3 = h2c(s.charAt(idx++)) << 4; |
---|
93 | int ch4 = h2c(s.charAt(idx++)); |
---|
94 | char res = (char)(ch1 | ch2 | ch3 | ch4); |
---|
95 | sb.append(res); |
---|
96 | } else { |
---|
97 | sb.append(ch); |
---|
98 | } |
---|
99 | } |
---|
100 | return sb.toString(); |
---|
101 | } |
---|
102 | |
---|
103 | /** |
---|
104 | * |
---|
105 | * @param s |
---|
106 | * @return |
---|
107 | */ |
---|
108 | static String toCSVString(String s) { |
---|
109 | StringBuffer sb = new StringBuffer(s.length()+1); |
---|
110 | sb.append('\''); |
---|
111 | int len = s.length(); |
---|
112 | for (int i = 0; i < len; i++) { |
---|
113 | char c = s.charAt(i); |
---|
114 | switch(c) { |
---|
115 | case '\0': |
---|
116 | sb.append("%00"); |
---|
117 | break; |
---|
118 | case '\n': |
---|
119 | sb.append("%0A"); |
---|
120 | break; |
---|
121 | case '\r': |
---|
122 | sb.append("%0D"); |
---|
123 | break; |
---|
124 | case ',': |
---|
125 | sb.append("%2C"); |
---|
126 | break; |
---|
127 | case '}': |
---|
128 | sb.append("%7D"); |
---|
129 | break; |
---|
130 | case '%': |
---|
131 | sb.append("%25"); |
---|
132 | break; |
---|
133 | default: |
---|
134 | sb.append(c); |
---|
135 | } |
---|
136 | } |
---|
137 | return sb.toString(); |
---|
138 | } |
---|
139 | |
---|
140 | /** |
---|
141 | * |
---|
142 | * @param s |
---|
143 | * @throws java.io.IOException |
---|
144 | * @return |
---|
145 | */ |
---|
146 | static String fromCSVString(String s) throws IOException { |
---|
147 | if (s.charAt(0) != '\'') { |
---|
148 | throw new IOException("Error deserializing string."); |
---|
149 | } |
---|
150 | int len = s.length(); |
---|
151 | StringBuffer sb = new StringBuffer(len-1); |
---|
152 | for (int i = 1; i < len; i++) { |
---|
153 | char c = s.charAt(i); |
---|
154 | if (c == '%') { |
---|
155 | char ch1 = s.charAt(i+1); |
---|
156 | char ch2 = s.charAt(i+2); |
---|
157 | i += 2; |
---|
158 | if (ch1 == '0' && ch2 == '0') { |
---|
159 | sb.append('\0'); |
---|
160 | } else if (ch1 == '0' && ch2 == 'A') { |
---|
161 | sb.append('\n'); |
---|
162 | } else if (ch1 == '0' && ch2 == 'D') { |
---|
163 | sb.append('\r'); |
---|
164 | } else if (ch1 == '2' && ch2 == 'C') { |
---|
165 | sb.append(','); |
---|
166 | } else if (ch1 == '7' && ch2 == 'D') { |
---|
167 | sb.append('}'); |
---|
168 | } else if (ch1 == '2' && ch2 == '5') { |
---|
169 | sb.append('%'); |
---|
170 | } else { |
---|
171 | throw new IOException("Error deserializing string."); |
---|
172 | } |
---|
173 | } else { |
---|
174 | sb.append(c); |
---|
175 | } |
---|
176 | } |
---|
177 | return sb.toString(); |
---|
178 | } |
---|
179 | |
---|
180 | /** |
---|
181 | * |
---|
182 | * @param s |
---|
183 | * @return |
---|
184 | */ |
---|
185 | static String toXMLBuffer(Buffer s) { |
---|
186 | return s.toString(); |
---|
187 | } |
---|
188 | |
---|
189 | /** |
---|
190 | * |
---|
191 | * @param s |
---|
192 | * @throws java.io.IOException |
---|
193 | * @return |
---|
194 | */ |
---|
195 | static Buffer fromXMLBuffer(String s) |
---|
196 | throws IOException { |
---|
197 | if (s.length() == 0) { return new Buffer(); } |
---|
198 | int blen = s.length()/2; |
---|
199 | byte[] barr = new byte[blen]; |
---|
200 | for (int idx = 0; idx < blen; idx++) { |
---|
201 | char c1 = s.charAt(2*idx); |
---|
202 | char c2 = s.charAt(2*idx+1); |
---|
203 | barr[idx] = (byte)Integer.parseInt(""+c1+c2, 16); |
---|
204 | } |
---|
205 | return new Buffer(barr); |
---|
206 | } |
---|
207 | |
---|
208 | /** |
---|
209 | * |
---|
210 | * @param buf |
---|
211 | * @return |
---|
212 | */ |
---|
213 | static String toCSVBuffer(Buffer buf) { |
---|
214 | StringBuffer sb = new StringBuffer("#"); |
---|
215 | sb.append(buf.toString()); |
---|
216 | return sb.toString(); |
---|
217 | } |
---|
218 | |
---|
219 | /** |
---|
220 | * Converts a CSV-serialized representation of buffer to a new |
---|
221 | * Buffer |
---|
222 | * @param s CSV-serialized representation of buffer |
---|
223 | * @throws java.io.IOException |
---|
224 | * @return Deserialized Buffer |
---|
225 | */ |
---|
226 | static Buffer fromCSVBuffer(String s) |
---|
227 | throws IOException { |
---|
228 | if (s.charAt(0) != '#') { |
---|
229 | throw new IOException("Error deserializing buffer."); |
---|
230 | } |
---|
231 | if (s.length() == 1) { return new Buffer(); } |
---|
232 | int blen = (s.length()-1)/2; |
---|
233 | byte[] barr = new byte[blen]; |
---|
234 | for (int idx = 0; idx < blen; idx++) { |
---|
235 | char c1 = s.charAt(2*idx+1); |
---|
236 | char c2 = s.charAt(2*idx+2); |
---|
237 | barr[idx] = (byte)Integer.parseInt(""+c1+c2, 16); |
---|
238 | } |
---|
239 | return new Buffer(barr); |
---|
240 | } |
---|
241 | |
---|
242 | private static int utf8LenForCodePoint(final int cpt) throws IOException { |
---|
243 | if (cpt >=0 && cpt <= 0x7F) { |
---|
244 | return 1; |
---|
245 | } |
---|
246 | if (cpt >= 0x80 && cpt <= 0x07FF) { |
---|
247 | return 2; |
---|
248 | } |
---|
249 | if ((cpt >= 0x0800 && cpt < 0xD800) || |
---|
250 | (cpt > 0xDFFF && cpt <= 0xFFFD)) { |
---|
251 | return 3; |
---|
252 | } |
---|
253 | if (cpt >= 0x10000 && cpt <= 0x10FFFF) { |
---|
254 | return 4; |
---|
255 | } |
---|
256 | throw new IOException("Illegal Unicode Codepoint "+ |
---|
257 | Integer.toHexString(cpt)+" in string."); |
---|
258 | } |
---|
259 | |
---|
260 | private static final int B10 = Integer.parseInt("10000000", 2); |
---|
261 | private static final int B110 = Integer.parseInt("11000000", 2); |
---|
262 | private static final int B1110 = Integer.parseInt("11100000", 2); |
---|
263 | private static final int B11110 = Integer.parseInt("11110000", 2); |
---|
264 | private static final int B11 = Integer.parseInt("11000000", 2); |
---|
265 | private static final int B111 = Integer.parseInt("11100000", 2); |
---|
266 | private static final int B1111 = Integer.parseInt("11110000", 2); |
---|
267 | private static final int B11111 = Integer.parseInt("11111000", 2); |
---|
268 | |
---|
269 | private static int writeUtf8(int cpt, final byte[] bytes, final int offset) |
---|
270 | throws IOException { |
---|
271 | if (cpt >=0 && cpt <= 0x7F) { |
---|
272 | bytes[offset] = (byte) cpt; |
---|
273 | return 1; |
---|
274 | } |
---|
275 | if (cpt >= 0x80 && cpt <= 0x07FF) { |
---|
276 | bytes[offset+1] = (byte) (B10 | (cpt & 0x3F)); |
---|
277 | cpt = cpt >> 6; |
---|
278 | bytes[offset] = (byte) (B110 | (cpt & 0x1F)); |
---|
279 | return 2; |
---|
280 | } |
---|
281 | if ((cpt >= 0x0800 && cpt < 0xD800) || |
---|
282 | (cpt > 0xDFFF && cpt <= 0xFFFD)) { |
---|
283 | bytes[offset+2] = (byte) (B10 | (cpt & 0x3F)); |
---|
284 | cpt = cpt >> 6; |
---|
285 | bytes[offset+1] = (byte) (B10 | (cpt & 0x3F)); |
---|
286 | cpt = cpt >> 6; |
---|
287 | bytes[offset] = (byte) (B1110 | (cpt & 0x0F)); |
---|
288 | return 3; |
---|
289 | } |
---|
290 | if (cpt >= 0x10000 && cpt <= 0x10FFFF) { |
---|
291 | bytes[offset+3] = (byte) (B10 | (cpt & 0x3F)); |
---|
292 | cpt = cpt >> 6; |
---|
293 | bytes[offset+2] = (byte) (B10 | (cpt & 0x3F)); |
---|
294 | cpt = cpt >> 6; |
---|
295 | bytes[offset+1] = (byte) (B10 | (cpt & 0x3F)); |
---|
296 | cpt = cpt >> 6; |
---|
297 | bytes[offset] = (byte) (B11110 | (cpt & 0x07)); |
---|
298 | return 4; |
---|
299 | } |
---|
300 | throw new IOException("Illegal Unicode Codepoint "+ |
---|
301 | Integer.toHexString(cpt)+" in string."); |
---|
302 | } |
---|
303 | |
---|
304 | static void toBinaryString(final DataOutput out, final String str) |
---|
305 | throws IOException { |
---|
306 | final int strlen = str.length(); |
---|
307 | byte[] bytes = new byte[strlen*4]; // Codepoints expand to 4 bytes max |
---|
308 | int utf8Len = 0; |
---|
309 | int idx = 0; |
---|
310 | while(idx < strlen) { |
---|
311 | final int cpt = str.codePointAt(idx); |
---|
312 | idx += Character.isSupplementaryCodePoint(cpt) ? 2 : 1; |
---|
313 | utf8Len += writeUtf8(cpt, bytes, utf8Len); |
---|
314 | } |
---|
315 | writeVInt(out, utf8Len); |
---|
316 | out.write(bytes, 0, utf8Len); |
---|
317 | } |
---|
318 | |
---|
319 | static boolean isValidCodePoint(int cpt) { |
---|
320 | return !((cpt > 0x10FFFF) || |
---|
321 | (cpt >= 0xD800 && cpt <= 0xDFFF) || |
---|
322 | (cpt >= 0xFFFE && cpt <=0xFFFF)); |
---|
323 | } |
---|
324 | |
---|
325 | private static int utf8ToCodePoint(int b1, int b2, int b3, int b4) { |
---|
326 | int cpt = 0; |
---|
327 | cpt = (((b1 & ~B11111) << 18) | |
---|
328 | ((b2 & ~B11) << 12) | |
---|
329 | ((b3 & ~B11) << 6) | |
---|
330 | (b4 & ~B11)); |
---|
331 | return cpt; |
---|
332 | } |
---|
333 | |
---|
334 | private static int utf8ToCodePoint(int b1, int b2, int b3) { |
---|
335 | int cpt = 0; |
---|
336 | cpt = (((b1 & ~B1111) << 12) | ((b2 & ~B11) << 6) | (b3 & ~B11)); |
---|
337 | return cpt; |
---|
338 | } |
---|
339 | |
---|
340 | private static int utf8ToCodePoint(int b1, int b2) { |
---|
341 | int cpt = 0; |
---|
342 | cpt = (((b1 & ~B111) << 6) | (b2 & ~B11)); |
---|
343 | return cpt; |
---|
344 | } |
---|
345 | |
---|
346 | private static void checkB10(int b) throws IOException { |
---|
347 | if ((b & B11) != B10) { |
---|
348 | throw new IOException("Invalid UTF-8 representation."); |
---|
349 | } |
---|
350 | } |
---|
351 | |
---|
352 | static String fromBinaryString(final DataInput din) throws IOException { |
---|
353 | final int utf8Len = readVInt(din); |
---|
354 | final byte[] bytes = new byte[utf8Len]; |
---|
355 | din.readFully(bytes); |
---|
356 | int len = 0; |
---|
357 | // For the most commmon case, i.e. ascii, numChars = utf8Len |
---|
358 | StringBuilder sb = new StringBuilder(utf8Len); |
---|
359 | while(len < utf8Len) { |
---|
360 | int cpt = 0; |
---|
361 | final int b1 = bytes[len++] & 0xFF; |
---|
362 | if (b1 <= 0x7F) { |
---|
363 | cpt = b1; |
---|
364 | } else if ((b1 & B11111) == B11110) { |
---|
365 | int b2 = bytes[len++] & 0xFF; |
---|
366 | checkB10(b2); |
---|
367 | int b3 = bytes[len++] & 0xFF; |
---|
368 | checkB10(b3); |
---|
369 | int b4 = bytes[len++] & 0xFF; |
---|
370 | checkB10(b4); |
---|
371 | cpt = utf8ToCodePoint(b1, b2, b3, b4); |
---|
372 | } else if ((b1 & B1111) == B1110) { |
---|
373 | int b2 = bytes[len++] & 0xFF; |
---|
374 | checkB10(b2); |
---|
375 | int b3 = bytes[len++] & 0xFF; |
---|
376 | checkB10(b3); |
---|
377 | cpt = utf8ToCodePoint(b1, b2, b3); |
---|
378 | } else if ((b1 & B111) == B110) { |
---|
379 | int b2 = bytes[len++] & 0xFF; |
---|
380 | checkB10(b2); |
---|
381 | cpt = utf8ToCodePoint(b1, b2); |
---|
382 | } else { |
---|
383 | throw new IOException("Invalid UTF-8 byte "+Integer.toHexString(b1)+ |
---|
384 | " at offset "+(len-1)+" in length of "+utf8Len); |
---|
385 | } |
---|
386 | if (!isValidCodePoint(cpt)) { |
---|
387 | throw new IOException("Illegal Unicode Codepoint "+ |
---|
388 | Integer.toHexString(cpt)+" in stream."); |
---|
389 | } |
---|
390 | sb.appendCodePoint(cpt); |
---|
391 | } |
---|
392 | return sb.toString(); |
---|
393 | } |
---|
394 | |
---|
395 | /** Parse a float from a byte array. */ |
---|
396 | public static float readFloat(byte[] bytes, int start) { |
---|
397 | return WritableComparator.readFloat(bytes, start); |
---|
398 | } |
---|
399 | |
---|
400 | /** Parse a double from a byte array. */ |
---|
401 | public static double readDouble(byte[] bytes, int start) { |
---|
402 | return WritableComparator.readDouble(bytes, start); |
---|
403 | } |
---|
404 | |
---|
405 | /** |
---|
406 | * Reads a zero-compressed encoded long from a byte array and returns it. |
---|
407 | * @param bytes byte array with decode long |
---|
408 | * @param start starting index |
---|
409 | * @throws java.io.IOException |
---|
410 | * @return deserialized long |
---|
411 | */ |
---|
412 | public static long readVLong(byte[] bytes, int start) throws IOException { |
---|
413 | return WritableComparator.readVLong(bytes, start); |
---|
414 | } |
---|
415 | |
---|
416 | /** |
---|
417 | * Reads a zero-compressed encoded integer from a byte array and returns it. |
---|
418 | * @param bytes byte array with the encoded integer |
---|
419 | * @param start start index |
---|
420 | * @throws java.io.IOException |
---|
421 | * @return deserialized integer |
---|
422 | */ |
---|
423 | public static int readVInt(byte[] bytes, int start) throws IOException { |
---|
424 | return WritableComparator.readVInt(bytes, start); |
---|
425 | } |
---|
426 | |
---|
427 | /** |
---|
428 | * Reads a zero-compressed encoded long from a stream and return it. |
---|
429 | * @param in input stream |
---|
430 | * @throws java.io.IOException |
---|
431 | * @return deserialized long |
---|
432 | */ |
---|
433 | public static long readVLong(DataInput in) throws IOException { |
---|
434 | return WritableUtils.readVLong(in); |
---|
435 | } |
---|
436 | |
---|
437 | /** |
---|
438 | * Reads a zero-compressed encoded integer from a stream and returns it. |
---|
439 | * @param in input stream |
---|
440 | * @throws java.io.IOException |
---|
441 | * @return deserialized integer |
---|
442 | */ |
---|
443 | public static int readVInt(DataInput in) throws IOException { |
---|
444 | return WritableUtils.readVInt(in); |
---|
445 | } |
---|
446 | |
---|
447 | /** |
---|
448 | * Get the encoded length if an integer is stored in a variable-length format |
---|
449 | * @return the encoded length |
---|
450 | */ |
---|
451 | public static int getVIntSize(long i) { |
---|
452 | return WritableUtils.getVIntSize(i); |
---|
453 | } |
---|
454 | |
---|
455 | /** |
---|
456 | * Serializes a long to a binary stream with zero-compressed encoding. |
---|
457 | * For -112 <= i <= 127, only one byte is used with the actual value. |
---|
458 | * For other values of i, the first byte value indicates whether the |
---|
459 | * long is positive or negative, and the number of bytes that follow. |
---|
460 | * If the first byte value v is between -113 and -120, the following long |
---|
461 | * is positive, with number of bytes that follow are -(v+112). |
---|
462 | * If the first byte value v is between -121 and -128, the following long |
---|
463 | * is negative, with number of bytes that follow are -(v+120). Bytes are |
---|
464 | * stored in the high-non-zero-byte-first order. |
---|
465 | * |
---|
466 | * @param stream Binary output stream |
---|
467 | * @param i Long to be serialized |
---|
468 | * @throws java.io.IOException |
---|
469 | */ |
---|
470 | public static void writeVLong(DataOutput stream, long i) throws IOException { |
---|
471 | WritableUtils.writeVLong(stream, i); |
---|
472 | } |
---|
473 | |
---|
474 | /** |
---|
475 | * Serializes an int to a binary stream with zero-compressed encoding. |
---|
476 | * |
---|
477 | * @param stream Binary output stream |
---|
478 | * @param i int to be serialized |
---|
479 | * @throws java.io.IOException |
---|
480 | */ |
---|
481 | public static void writeVInt(DataOutput stream, int i) throws IOException { |
---|
482 | WritableUtils.writeVInt(stream, i); |
---|
483 | } |
---|
484 | |
---|
485 | /** Lexicographic order of binary data. */ |
---|
486 | public static int compareBytes(byte[] b1, int s1, int l1, |
---|
487 | byte[] b2, int s2, int l2) { |
---|
488 | return WritableComparator.compareBytes(b1, s1, l1, b2, s2, l2); |
---|
489 | } |
---|
490 | } |
---|