[120] | 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> |
---|
| 2 | <!--NewPage--> |
---|
| 3 | <HTML> |
---|
| 4 | <HEAD> |
---|
| 5 | <!-- Generated by javadoc (build 1.6.0_07) on Tue Sep 01 20:57:00 UTC 2009 --> |
---|
| 6 | <TITLE> |
---|
| 7 | org.apache.hadoop.record (Hadoop 0.20.1 API) |
---|
| 8 | </TITLE> |
---|
| 9 | |
---|
| 10 | <META NAME="date" CONTENT="2009-09-01"> |
---|
| 11 | |
---|
| 12 | <LINK REL ="stylesheet" TYPE="text/css" HREF="../../../../stylesheet.css" TITLE="Style"> |
---|
| 13 | |
---|
| 14 | <SCRIPT type="text/javascript"> |
---|
| 15 | function windowTitle() |
---|
| 16 | { |
---|
| 17 | if (location.href.indexOf('is-external=true') == -1) { |
---|
| 18 | parent.document.title="org.apache.hadoop.record (Hadoop 0.20.1 API)"; |
---|
| 19 | } |
---|
| 20 | } |
---|
| 21 | </SCRIPT> |
---|
| 22 | <NOSCRIPT> |
---|
| 23 | </NOSCRIPT> |
---|
| 24 | |
---|
| 25 | </HEAD> |
---|
| 26 | |
---|
| 27 | <BODY BGCOLOR="white" onload="windowTitle();"> |
---|
| 28 | <HR> |
---|
| 29 | |
---|
| 30 | |
---|
| 31 | <!-- ========= START OF TOP NAVBAR ======= --> |
---|
| 32 | <A NAME="navbar_top"><!-- --></A> |
---|
| 33 | <A HREF="#skip-navbar_top" title="Skip navigation links"></A> |
---|
| 34 | <TABLE BORDER="0" WIDTH="100%" CELLPADDING="1" CELLSPACING="0" SUMMARY=""> |
---|
| 35 | <TR> |
---|
| 36 | <TD COLSPAN=2 BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> |
---|
| 37 | <A NAME="navbar_top_firstrow"><!-- --></A> |
---|
| 38 | <TABLE BORDER="0" CELLPADDING="0" CELLSPACING="3" SUMMARY=""> |
---|
| 39 | <TR ALIGN="center" VALIGN="top"> |
---|
| 40 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../overview-summary.html"><FONT CLASS="NavBarFont1"><B>Overview</B></FONT></A> </TD> |
---|
| 41 | <TD BGCOLOR="#FFFFFF" CLASS="NavBarCell1Rev"> <FONT CLASS="NavBarFont1Rev"><B>Package</B></FONT> </TD> |
---|
| 42 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <FONT CLASS="NavBarFont1">Class</FONT> </TD> |
---|
| 43 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="package-use.html"><FONT CLASS="NavBarFont1"><B>Use</B></FONT></A> </TD> |
---|
| 44 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="package-tree.html"><FONT CLASS="NavBarFont1"><B>Tree</B></FONT></A> </TD> |
---|
| 45 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../deprecated-list.html"><FONT CLASS="NavBarFont1"><B>Deprecated</B></FONT></A> </TD> |
---|
| 46 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../index-all.html"><FONT CLASS="NavBarFont1"><B>Index</B></FONT></A> </TD> |
---|
| 47 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../help-doc.html"><FONT CLASS="NavBarFont1"><B>Help</B></FONT></A> </TD> |
---|
| 48 | </TR> |
---|
| 49 | </TABLE> |
---|
| 50 | </TD> |
---|
| 51 | <TD ALIGN="right" VALIGN="top" ROWSPAN=3><EM> |
---|
| 52 | </EM> |
---|
| 53 | </TD> |
---|
| 54 | </TR> |
---|
| 55 | |
---|
| 56 | <TR> |
---|
| 57 | <TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2"> |
---|
| 58 | <A HREF="../../../../org/apache/hadoop/net/package-summary.html"><B>PREV PACKAGE</B></A> |
---|
| 59 | <A HREF="../../../../org/apache/hadoop/record/compiler/package-summary.html"><B>NEXT PACKAGE</B></A></FONT></TD> |
---|
| 60 | <TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2"> |
---|
| 61 | <A HREF="../../../../index.html?org/apache/hadoop/record/package-summary.html" target="_top"><B>FRAMES</B></A> |
---|
| 62 | <A HREF="package-summary.html" target="_top"><B>NO FRAMES</B></A> |
---|
| 63 | <SCRIPT type="text/javascript"> |
---|
| 64 | <!-- |
---|
| 65 | if(window==top) { |
---|
| 66 | document.writeln('<A HREF="../../../../allclasses-noframe.html"><B>All Classes</B></A>'); |
---|
| 67 | } |
---|
| 68 | //--> |
---|
| 69 | </SCRIPT> |
---|
| 70 | <NOSCRIPT> |
---|
| 71 | <A HREF="../../../../allclasses-noframe.html"><B>All Classes</B></A> |
---|
| 72 | </NOSCRIPT> |
---|
| 73 | |
---|
| 74 | |
---|
| 75 | </FONT></TD> |
---|
| 76 | </TR> |
---|
| 77 | </TABLE> |
---|
| 78 | <A NAME="skip-navbar_top"></A> |
---|
| 79 | <!-- ========= END OF TOP NAVBAR ========= --> |
---|
| 80 | |
---|
| 81 | <HR> |
---|
| 82 | <H2> |
---|
| 83 | Package org.apache.hadoop.record |
---|
| 84 | </H2> |
---|
| 85 | Hadoop record I/O contains classes and a record description language |
---|
| 86 | translator for simplifying serialization and deserialization of records in a |
---|
| 87 | language-neutral manner. |
---|
| 88 | <P> |
---|
| 89 | <B>See:</B> |
---|
| 90 | <BR> |
---|
| 91 | <A HREF="#package_description"><B>Description</B></A> |
---|
| 92 | <P> |
---|
| 93 | |
---|
| 94 | <TABLE BORDER="1" WIDTH="100%" CELLPADDING="3" CELLSPACING="0" SUMMARY=""> |
---|
| 95 | <TR BGCOLOR="#CCCCFF" CLASS="TableHeadingColor"> |
---|
| 96 | <TH ALIGN="left" COLSPAN="2"><FONT SIZE="+2"> |
---|
| 97 | <B>Interface Summary</B></FONT></TH> |
---|
| 98 | </TR> |
---|
| 99 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
| 100 | <TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/Index.html" title="interface in org.apache.hadoop.record">Index</A></B></TD> |
---|
| 101 | <TD>Interface that acts as an iterator for deserializing maps.</TD> |
---|
| 102 | </TR> |
---|
| 103 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
| 104 | <TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/RecordInput.html" title="interface in org.apache.hadoop.record">RecordInput</A></B></TD> |
---|
| 105 | <TD>Interface that all the Deserializers have to implement.</TD> |
---|
| 106 | </TR> |
---|
| 107 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
| 108 | <TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/RecordOutput.html" title="interface in org.apache.hadoop.record">RecordOutput</A></B></TD> |
---|
| 109 | <TD>Interface that all the serializers have to implement.</TD> |
---|
| 110 | </TR> |
---|
| 111 | </TABLE> |
---|
| 112 | |
---|
| 113 | |
---|
| 114 | <P> |
---|
| 115 | |
---|
| 116 | <TABLE BORDER="1" WIDTH="100%" CELLPADDING="3" CELLSPACING="0" SUMMARY=""> |
---|
| 117 | <TR BGCOLOR="#CCCCFF" CLASS="TableHeadingColor"> |
---|
| 118 | <TH ALIGN="left" COLSPAN="2"><FONT SIZE="+2"> |
---|
| 119 | <B>Class Summary</B></FONT></TH> |
---|
| 120 | </TR> |
---|
| 121 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
| 122 | <TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/BinaryRecordInput.html" title="class in org.apache.hadoop.record">BinaryRecordInput</A></B></TD> |
---|
| 123 | <TD> </TD> |
---|
| 124 | </TR> |
---|
| 125 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
| 126 | <TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/BinaryRecordOutput.html" title="class in org.apache.hadoop.record">BinaryRecordOutput</A></B></TD> |
---|
| 127 | <TD> </TD> |
---|
| 128 | </TR> |
---|
| 129 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
| 130 | <TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/Buffer.html" title="class in org.apache.hadoop.record">Buffer</A></B></TD> |
---|
| 131 | <TD>A byte sequence that is used as a Java native type for buffer.</TD> |
---|
| 132 | </TR> |
---|
| 133 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
| 134 | <TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/CsvRecordInput.html" title="class in org.apache.hadoop.record">CsvRecordInput</A></B></TD> |
---|
| 135 | <TD> </TD> |
---|
| 136 | </TR> |
---|
| 137 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
| 138 | <TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/CsvRecordOutput.html" title="class in org.apache.hadoop.record">CsvRecordOutput</A></B></TD> |
---|
| 139 | <TD> </TD> |
---|
| 140 | </TR> |
---|
| 141 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
| 142 | <TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/Record.html" title="class in org.apache.hadoop.record">Record</A></B></TD> |
---|
| 143 | <TD>Abstract class that is extended by generated classes.</TD> |
---|
| 144 | </TR> |
---|
| 145 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
| 146 | <TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/RecordComparator.html" title="class in org.apache.hadoop.record">RecordComparator</A></B></TD> |
---|
| 147 | <TD>A raw record comparator base class.</TD> |
---|
| 148 | </TR> |
---|
| 149 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
| 150 | <TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/Utils.html" title="class in org.apache.hadoop.record">Utils</A></B></TD> |
---|
| 151 | <TD>Various utility functions for Hadoop record I/O runtime.</TD> |
---|
| 152 | </TR> |
---|
| 153 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
| 154 | <TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/XmlRecordInput.html" title="class in org.apache.hadoop.record">XmlRecordInput</A></B></TD> |
---|
| 155 | <TD>XML Deserializer.</TD> |
---|
| 156 | </TR> |
---|
| 157 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
| 158 | <TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/XmlRecordOutput.html" title="class in org.apache.hadoop.record">XmlRecordOutput</A></B></TD> |
---|
| 159 | <TD>XML Serializer.</TD> |
---|
| 160 | </TR> |
---|
| 161 | </TABLE> |
---|
| 162 | |
---|
| 163 | |
---|
| 164 | <P> |
---|
| 165 | <A NAME="package_description"><!-- --></A><H2> |
---|
| 166 | Package org.apache.hadoop.record Description |
---|
| 167 | </H2> |
---|
| 168 | |
---|
| 169 | <P> |
---|
| 170 | Hadoop record I/O contains classes and a record description language |
---|
| 171 | translator for simplifying serialization and deserialization of records in a |
---|
| 172 | language-neutral manner. |
---|
| 173 | |
---|
| 174 | <h2>Introduction</h2> |
---|
| 175 | |
---|
| 176 | Software systems of any significant complexity require mechanisms for data |
---|
| 177 | interchange with the outside world. These interchanges typically involve the |
---|
| 178 | marshaling and unmarshaling of logical units of data to and from data streams |
---|
| 179 | (files, network connections, memory buffers etc.). Applications usually have |
---|
| 180 | some code for serializing and deserializing the data types that they manipulate |
---|
| 181 | embedded in them. The work of serialization has several features that make |
---|
| 182 | automatic code generation for it worthwhile. Given a particular output encoding |
---|
| 183 | (binary, XML, etc.), serialization of primitive types and simple compositions |
---|
| 184 | of primitives (structs, vectors etc.) is a very mechanical task. Manually |
---|
| 185 | written serialization code can be susceptible to bugs especially when records |
---|
| 186 | have a large number of fields or a record definition changes between software |
---|
| 187 | versions. Lastly, it can be very useful for applications written in different |
---|
| 188 | programming languages to be able to share and interchange data. This can be |
---|
| 189 | made a lot easier by describing the data records manipulated by these |
---|
| 190 | applications in a language agnostic manner and using the descriptions to derive |
---|
| 191 | implementations of serialization in multiple target languages. |
---|
| 192 | |
---|
| 193 | This document describes Hadoop Record I/O, a mechanism that is aimed |
---|
| 194 | at |
---|
| 195 | <ul> |
---|
| 196 | <li> enabling the specification of simple serializable data types (records) |
---|
| 197 | <li> enabling the generation of code in multiple target languages for |
---|
| 198 | marshaling and unmarshaling such types |
---|
| 199 | <li> providing target language specific support that will enable application |
---|
| 200 | programmers to incorporate generated code into their applications |
---|
| 201 | </ul> |
---|
| 202 | |
---|
| 203 | The goals of Hadoop Record I/O are similar to those of mechanisms such as XDR, |
---|
| 204 | ASN.1, PADS and ICE. While these systems all include a DDL that enables |
---|
| 205 | the specification of most record types, they differ widely in what else they |
---|
| 206 | focus on. The focus in Hadoop Record I/O is on data marshaling and |
---|
| 207 | multi-lingual support. We take a translator-based approach to serialization. |
---|
| 208 | Hadoop users have to describe their data in a simple data description |
---|
| 209 | language. The Hadoop DDL translator rcc generates code that users |
---|
| 210 | can invoke in order to read/write their data from/to simple stream |
---|
| 211 | abstractions. Next we list explicitly some of the goals and non-goals of |
---|
| 212 | Hadoop Record I/O. |
---|
| 213 | |
---|
| 214 | |
---|
| 215 | <h3>Goals</h3> |
---|
| 216 | |
---|
| 217 | <ul> |
---|
| 218 | <li> Support for commonly used primitive types. Hadoop should include as |
---|
| 219 | primitives commonly used builtin types from programming languages we intend to |
---|
| 220 | support. |
---|
| 221 | |
---|
| 222 | <li> Support for common data compositions (including recursive compositions). |
---|
| 223 | Hadoop should support widely used composite types such as structs and |
---|
| 224 | vectors. |
---|
| 225 | |
---|
| 226 | <li> Code generation in multiple target languages. Hadoop should be capable of |
---|
| 227 | generating serialization code in multiple target languages and should be |
---|
| 228 | easily extensible to new target languages. The initial target languages are |
---|
| 229 | C++ and Java. |
---|
| 230 | |
---|
| 231 | <li> Support for generated target languages. Hadoop should include support |
---|
| 232 | in the form of headers, libraries, packages for supported target languages |
---|
| 233 | that enable easy inclusion and use of generated code in applications. |
---|
| 234 | |
---|
| 235 | <li> Support for multiple output encodings. Candidates include |
---|
| 236 | packed binary, comma-separated text, XML etc. |
---|
| 237 | |
---|
| 238 | <li> Support for specifying record types in a backwards/forwards compatible |
---|
| 239 | manner. This will probably be in the form of support for optional fields in |
---|
| 240 | records. This version of the document does not include a description of the |
---|
| 241 | planned mechanism, we intend to include it in the next iteration. |
---|
| 242 | |
---|
| 243 | </ul> |
---|
| 244 | |
---|
| 245 | <h3>Non-Goals</h3> |
---|
| 246 | |
---|
| 247 | <ul> |
---|
| 248 | <li> Serializing existing arbitrary C++ classes. |
---|
| 249 | <li> Serializing complex data structures such as trees, linked lists etc. |
---|
| 250 | <li> Built-in indexing schemes, compression, or check-sums. |
---|
| 251 | <li> Dynamic construction of objects from an XML schema. |
---|
| 252 | </ul> |
---|
| 253 | |
---|
| 254 | The remainder of this document describes the features of Hadoop record I/O |
---|
| 255 | in more detail. Section 2 describes the data types supported by the system. |
---|
| 256 | Section 3 lays out the DDL syntax with some examples of simple records. |
---|
| 257 | Section 4 describes the process of code generation with rcc. Section 5 |
---|
| 258 | describes target language mappings and support for Hadoop types. We include a |
---|
| 259 | fairly complete description of C++ mappings with intent to include Java and |
---|
| 260 | others in upcoming iterations of this document. The last section talks about |
---|
| 261 | supported output encodings. |
---|
| 262 | |
---|
| 263 | |
---|
| 264 | <h2>Data Types and Streams</h2> |
---|
| 265 | |
---|
| 266 | This section describes the primitive and composite types supported by Hadoop. |
---|
| 267 | We aim to support a set of types that can be used to simply and efficiently |
---|
| 268 | express a wide range of record types in different programming languages. |
---|
| 269 | |
---|
| 270 | <h3>Primitive Types</h3> |
---|
| 271 | |
---|
| 272 | For the most part, the primitive types of Hadoop map directly to primitive |
---|
| 273 | types in high level programming languages. Special cases are the |
---|
| 274 | ustring (a Unicode string) and buffer types, which we believe |
---|
| 275 | find wide use and which are usually implemented in library code and not |
---|
| 276 | available as language built-ins. Hadoop also supplies these via library code |
---|
| 277 | when a target language built-in is not present and there is no widely |
---|
| 278 | adopted "standard" implementation. The complete list of primitive types is: |
---|
| 279 | |
---|
| 280 | <ul> |
---|
| 281 | <li> byte: An 8-bit unsigned integer. |
---|
| 282 | <li> boolean: A boolean value. |
---|
| 283 | <li> int: A 32-bit signed integer. |
---|
| 284 | <li> long: A 64-bit signed integer. |
---|
| 285 | <li> float: A single precision floating point number as described by |
---|
| 286 | IEEE-754. |
---|
| 287 | <li> double: A double precision floating point number as described by |
---|
| 288 | IEEE-754. |
---|
| 289 | <li> ustring: A string consisting of Unicode characters. |
---|
| 290 | <li> buffer: An arbitrary sequence of bytes. |
---|
| 291 | </ul> |
---|
| 292 | |
---|
| 293 | |
---|
| 294 | <h3>Composite Types</h3> |
---|
| 295 | Hadoop supports a small set of composite types that enable the description |
---|
| 296 | of simple aggregate types and containers. A composite type is serialized |
---|
| 297 | by sequentially serializing its constituent elements. The supported |
---|
| 298 | composite types are: |
---|
| 299 | |
---|
| 300 | <ul> |
---|
| 301 | |
---|
| 302 | <li> record: An aggregate type like a C-struct. This is a list of |
---|
| 303 | typed fields that are together considered a single unit of data. A record |
---|
| 304 | is serialized by sequentially serializing its constituent fields. In addition |
---|
| 305 | to serialization a record has comparison operations (equality and less-than) |
---|
| 306 | implemented for it, these are defined as memberwise comparisons. |
---|
| 307 | |
---|
| 308 | <li>vector: A sequence of entries of the same data type, primitive |
---|
| 309 | or composite. |
---|
| 310 | |
---|
| 311 | <li> map: An associative container mapping instances of a key type to |
---|
| 312 | instances of a value type. The key and value types may themselves be primitive |
---|
| 313 | or composite types. |
---|
| 314 | |
---|
| 315 | </ul> |
---|
| 316 | |
---|
| 317 | <h3>Streams</h3> |
---|
| 318 | |
---|
| 319 | Hadoop generates code for serializing and deserializing record types to |
---|
| 320 | abstract streams. For each target language Hadoop defines very simple input |
---|
| 321 | and output stream interfaces. Application writers can usually develop |
---|
| 322 | concrete implementations of these by putting a one method wrapper around |
---|
| 323 | an existing stream implementation. |
---|
| 324 | |
---|
| 325 | |
---|
| 326 | <h2>DDL Syntax and Examples</h2> |
---|
| 327 | |
---|
| 328 | We now describe the syntax of the Hadoop data description language. This is |
---|
| 329 | followed by a few examples of DDL usage. |
---|
| 330 | |
---|
| 331 | <h3>Hadoop DDL Syntax</h3> |
---|
| 332 | |
---|
| 333 | <pre><code> |
---|
| 334 | recfile = *include module *record |
---|
| 335 | include = "include" path |
---|
| 336 | path = (relative-path / absolute-path) |
---|
| 337 | module = "module" module-name |
---|
| 338 | module-name = name *("." name) |
---|
| 339 | record := "class" name "{" 1*(field) "}" |
---|
| 340 | field := type name ";" |
---|
| 341 | name := ALPHA (ALPHA / DIGIT / "_" )* |
---|
| 342 | type := (ptype / ctype) |
---|
| 343 | ptype := ("byte" / "boolean" / "int" / |
---|
| 344 | "long" / "float" / "double" / |
---|
| 345 | "ustring" / "buffer") |
---|
| 346 | ctype := (("vector" "<" type ">") / |
---|
| 347 | ("map" "<" type "," type ">") / name) |
---|
| 348 | </code></pre> |
---|
| 349 | |
---|
| 350 | A DDL file describes one or more record types. It begins with zero or |
---|
| 351 | more include declarations, a single mandatory module declaration |
---|
| 352 | followed by zero or more class declarations. The semantics of each of |
---|
| 353 | these declarations are described below: |
---|
| 354 | |
---|
| 355 | <ul> |
---|
| 356 | |
---|
| 357 | <li>include: An include declaration specifies a DDL file to be |
---|
| 358 | referenced when generating code for types in the current DDL file. Record types |
---|
| 359 | in the current compilation unit may refer to types in all included files. |
---|
| 360 | File inclusion is recursive. An include does not trigger code |
---|
| 361 | generation for the referenced file. |
---|
| 362 | |
---|
| 363 | <li> module: Every Hadoop DDL file must have a single module |
---|
| 364 | declaration that follows the list of includes and precedes all record |
---|
| 365 | declarations. A module declaration identifies a scope within which |
---|
| 366 | the names of all types in the current file are visible. Module names are |
---|
| 367 | mapped to C++ namespaces, Java packages etc. in generated code. |
---|
| 368 | |
---|
| 369 | <li> class: Records types are specified through class |
---|
| 370 | declarations. A class declaration is like a Java class declaration. |
---|
| 371 | It specifies a named record type and a list of fields that constitute records |
---|
| 372 | of the type. Usage is illustrated in the following examples. |
---|
| 373 | |
---|
| 374 | </ul> |
---|
| 375 | |
---|
| 376 | <h3>Examples</h3> |
---|
| 377 | |
---|
| 378 | <ul> |
---|
| 379 | <li>A simple DDL file links.jr with just one record declaration. |
---|
| 380 | <pre><code> |
---|
| 381 | module links { |
---|
| 382 | class Link { |
---|
| 383 | ustring URL; |
---|
| 384 | boolean isRelative; |
---|
| 385 | ustring anchorText; |
---|
| 386 | }; |
---|
| 387 | } |
---|
| 388 | </code></pre> |
---|
| 389 | |
---|
| 390 | <li> A DDL file outlinks.jr which includes another |
---|
| 391 | <pre><code> |
---|
| 392 | include "links.jr" |
---|
| 393 | |
---|
| 394 | module outlinks { |
---|
| 395 | class OutLinks { |
---|
| 396 | ustring baseURL; |
---|
| 397 | vector<links.Link> outLinks; |
---|
| 398 | }; |
---|
| 399 | } |
---|
| 400 | </code></pre> |
---|
| 401 | </ul> |
---|
| 402 | |
---|
| 403 | <h2>Code Generation</h2> |
---|
| 404 | |
---|
| 405 | The Hadoop translator is written in Java. Invocation is done by executing a |
---|
| 406 | wrapper shell script named rcc. It takes a list of |
---|
| 407 | record description files as a mandatory argument and an |
---|
| 408 | optional language argument (the default is Java) --language or |
---|
| 409 | -l. Thus a typical invocation would look like: |
---|
| 410 | <pre><code> |
---|
| 411 | $ rcc -l C++ <filename> ... |
---|
| 412 | </code></pre> |
---|
| 413 | |
---|
| 414 | |
---|
| 415 | <h2>Target Language Mappings and Support</h2> |
---|
| 416 | |
---|
| 417 | For all target languages, the unit of code generation is a record type. |
---|
| 418 | For each record type, Hadoop generates code for serialization and |
---|
| 419 | deserialization, record comparison and access to record members. |
---|
| 420 | |
---|
| 421 | <h3>C++</h3> |
---|
| 422 | |
---|
| 423 | Support for including Hadoop generated C++ code in applications comes in the |
---|
| 424 | form of a header file recordio.hh which needs to be included in source |
---|
| 425 | that uses Hadoop types and a library librecordio.a which applications need |
---|
| 426 | to be linked with. The header declares the Hadoop C++ namespace which defines |
---|
| 427 | appropriate types for the various primitives, the basic interfaces for |
---|
| 428 | records and streams and enumerates the supported serialization encodings. |
---|
| 429 | Declarations of these interfaces and a description of their semantics follow: |
---|
| 430 | |
---|
| 431 | <pre><code> |
---|
| 432 | namespace hadoop { |
---|
| 433 | |
---|
| 434 | enum RecFormat { kBinary, kXML, kCSV }; |
---|
| 435 | |
---|
| 436 | class InStream { |
---|
| 437 | public: |
---|
| 438 | virtual ssize_t read(void *buf, size_t n) = 0; |
---|
| 439 | }; |
---|
| 440 | |
---|
| 441 | class OutStream { |
---|
| 442 | public: |
---|
| 443 | virtual ssize_t write(const void *buf, size_t n) = 0; |
---|
| 444 | }; |
---|
| 445 | |
---|
| 446 | class IOError : public runtime_error { |
---|
| 447 | public: |
---|
| 448 | explicit IOError(const std::string& msg); |
---|
| 449 | }; |
---|
| 450 | |
---|
| 451 | class IArchive; |
---|
| 452 | class OArchive; |
---|
| 453 | |
---|
| 454 | class RecordReader { |
---|
| 455 | public: |
---|
| 456 | RecordReader(InStream& in, RecFormat fmt); |
---|
| 457 | virtual ~RecordReader(void); |
---|
| 458 | |
---|
| 459 | virtual void read(Record& rec); |
---|
| 460 | }; |
---|
| 461 | |
---|
| 462 | class RecordWriter { |
---|
| 463 | public: |
---|
| 464 | RecordWriter(OutStream& out, RecFormat fmt); |
---|
| 465 | virtual ~RecordWriter(void); |
---|
| 466 | |
---|
| 467 | virtual void write(Record& rec); |
---|
| 468 | }; |
---|
| 469 | |
---|
| 470 | |
---|
| 471 | class Record { |
---|
| 472 | public: |
---|
| 473 | virtual std::string type(void) const = 0; |
---|
| 474 | virtual std::string signature(void) const = 0; |
---|
| 475 | protected: |
---|
| 476 | virtual bool validate(void) const = 0; |
---|
| 477 | |
---|
| 478 | virtual void |
---|
| 479 | serialize(OArchive& oa, const std::string& tag) const = 0; |
---|
| 480 | |
---|
| 481 | virtual void |
---|
| 482 | deserialize(IArchive& ia, const std::string& tag) = 0; |
---|
| 483 | }; |
---|
| 484 | } |
---|
| 485 | </code></pre> |
---|
| 486 | |
---|
| 487 | <ul> |
---|
| 488 | |
---|
| 489 | <li> RecFormat: An enumeration of the serialization encodings supported |
---|
| 490 | by this implementation of Hadoop. |
---|
| 491 | |
---|
| 492 | <li> InStream: A simple abstraction for an input stream. This has a |
---|
| 493 | single public read method that reads n bytes from the stream into |
---|
| 494 | the buffer buf. Has the same semantics as a blocking read system |
---|
| 495 | call. Returns the number of bytes read or -1 if an error occurs. |
---|
| 496 | |
---|
| 497 | <li> OutStream: A simple abstraction for an output stream. This has a |
---|
| 498 | single write method that writes n bytes to the stream from the |
---|
| 499 | buffer buf. Has the same semantics as a blocking write system |
---|
| 500 | call. Returns the number of bytes written or -1 if an error occurs. |
---|
| 501 | |
---|
| 502 | <li> RecordReader: A RecordReader reads records one at a time from |
---|
| 503 | an underlying stream in a specified record format. The reader is instantiated |
---|
| 504 | with a stream and a serialization format. It has a read method that |
---|
| 505 | takes an instance of a record and deserializes the record from the stream. |
---|
| 506 | |
---|
| 507 | <li> RecordWriter: A RecordWriter writes records one at a |
---|
| 508 | time to an underlying stream in a specified record format. The writer is |
---|
| 509 | instantiated with a stream and a serialization format. It has a |
---|
| 510 | write method that takes an instance of a record and serializes the |
---|
| 511 | record to the stream. |
---|
| 512 | |
---|
| 513 | <li> Record: The base class for all generated record types. This has two |
---|
| 514 | public methods type and signature that return the typename and the |
---|
| 515 | type signature of the record. |
---|
| 516 | |
---|
| 517 | </ul> |
---|
| 518 | |
---|
| 519 | Two files are generated for each record file (note: not for each record). If a |
---|
| 520 | record file is named "name.jr", the generated files are |
---|
| 521 | "name.jr.cc" and "name.jr.hh" containing serialization |
---|
| 522 | implementations and record type declarations respectively. |
---|
| 523 | |
---|
| 524 | For each record in the DDL file, the generated header file will contain a |
---|
| 525 | class definition corresponding to the record type, method definitions for the |
---|
| 526 | generated type will be present in the '.cc' file. The generated class will |
---|
| 527 | inherit from the abstract class hadoop::Record. The DDL files |
---|
| 528 | module declaration determines the namespace the record belongs to. |
---|
| 529 | Each '.' delimited token in the module declaration results in the |
---|
| 530 | creation of a namespace. For instance, the declaration module docs.links |
---|
| 531 | results in the creation of a docs namespace and a nested |
---|
| 532 | docs::links namespace. In the preceding examples, the Link class |
---|
| 533 | is placed in the links namespace. The header file corresponding to |
---|
| 534 | the links.jr file will contain: |
---|
| 535 | |
---|
| 536 | <pre><code> |
---|
| 537 | namespace links { |
---|
| 538 | class Link : public hadoop::Record { |
---|
| 539 | // .... |
---|
| 540 | }; |
---|
| 541 | }; |
---|
| 542 | </code></pre> |
---|
| 543 | |
---|
| 544 | Each field within the record will cause the generation of a private member |
---|
| 545 | declaration of the appropriate type in the class declaration, and one or more |
---|
| 546 | accessor methods. The generated class will implement the serialize and |
---|
| 547 | deserialize methods defined in hadoop::Record+. It will also |
---|
| 548 | implement the inspection methods type and signature from |
---|
| 549 | hadoop::Record. A default constructor and virtual destructor will also |
---|
| 550 | be generated. Serialization code will read/write records into streams that |
---|
| 551 | implement the hadoop::InStream and the hadoop::OutStream interfaces. |
---|
| 552 | |
---|
| 553 | For each member of a record an accessor method is generated that returns |
---|
| 554 | either the member or a reference to the member. For members that are returned |
---|
| 555 | by value, a setter method is also generated. This is true for primitive |
---|
| 556 | data members of the types byte, int, long, boolean, float and |
---|
| 557 | double. For example, for an int field called MyField the following |
---|
| 558 | code is generated. |
---|
| 559 | |
---|
| 560 | <pre><code> |
---|
| 561 | ... |
---|
| 562 | private: |
---|
| 563 | int32_t mMyField; |
---|
| 564 | ... |
---|
| 565 | public: |
---|
| 566 | int32_t getMyField(void) const { |
---|
| 567 | return mMyField; |
---|
| 568 | }; |
---|
| 569 | |
---|
| 570 | void setMyField(int32_t m) { |
---|
| 571 | mMyField = m; |
---|
| 572 | }; |
---|
| 573 | ... |
---|
| 574 | </code></pre> |
---|
| 575 | |
---|
| 576 | For a ustring or buffer or composite field. The generated code |
---|
| 577 | only contains accessors that return a reference to the field. A const |
---|
| 578 | and a non-const accessor are generated. For example: |
---|
| 579 | |
---|
| 580 | <pre><code> |
---|
| 581 | ... |
---|
| 582 | private: |
---|
| 583 | std::string mMyBuf; |
---|
| 584 | ... |
---|
| 585 | public: |
---|
| 586 | |
---|
| 587 | std::string& getMyBuf() { |
---|
| 588 | return mMyBuf; |
---|
| 589 | }; |
---|
| 590 | |
---|
| 591 | const std::string& getMyBuf() const { |
---|
| 592 | return mMyBuf; |
---|
| 593 | }; |
---|
| 594 | ... |
---|
| 595 | </code></pre> |
---|
| 596 | |
---|
| 597 | <h4>Examples</h4> |
---|
| 598 | |
---|
| 599 | Suppose the inclrec.jr file contains: |
---|
| 600 | <pre><code> |
---|
| 601 | module inclrec { |
---|
| 602 | class RI { |
---|
| 603 | int I32; |
---|
| 604 | double D; |
---|
| 605 | ustring S; |
---|
| 606 | }; |
---|
| 607 | } |
---|
| 608 | </code></pre> |
---|
| 609 | |
---|
| 610 | and the testrec.jr file contains: |
---|
| 611 | |
---|
| 612 | <pre><code> |
---|
| 613 | include "inclrec.jr" |
---|
| 614 | module testrec { |
---|
| 615 | class R { |
---|
| 616 | vector<float> VF; |
---|
| 617 | RI Rec; |
---|
| 618 | buffer Buf; |
---|
| 619 | }; |
---|
| 620 | } |
---|
| 621 | </code></pre> |
---|
| 622 | |
---|
| 623 | Then the invocation of rcc such as: |
---|
| 624 | <pre><code> |
---|
| 625 | $ rcc -l c++ inclrec.jr testrec.jr |
---|
| 626 | </code></pre> |
---|
| 627 | will result in generation of four files: |
---|
| 628 | inclrec.jr.{cc,hh} and testrec.jr.{cc,hh}. |
---|
| 629 | |
---|
| 630 | The inclrec.jr.hh will contain: |
---|
| 631 | |
---|
| 632 | <pre><code> |
---|
| 633 | #ifndef _INCLREC_JR_HH_ |
---|
| 634 | #define _INCLREC_JR_HH_ |
---|
| 635 | |
---|
| 636 | #include "recordio.hh" |
---|
| 637 | |
---|
| 638 | namespace inclrec { |
---|
| 639 | |
---|
| 640 | class RI : public hadoop::Record { |
---|
| 641 | |
---|
| 642 | private: |
---|
| 643 | |
---|
| 644 | int32_t I32; |
---|
| 645 | double D; |
---|
| 646 | std::string S; |
---|
| 647 | |
---|
| 648 | public: |
---|
| 649 | |
---|
| 650 | RI(void); |
---|
| 651 | virtual ~RI(void); |
---|
| 652 | |
---|
| 653 | virtual bool operator==(const RI& peer) const; |
---|
| 654 | virtual bool operator<(const RI& peer) const; |
---|
| 655 | |
---|
| 656 | virtual int32_t getI32(void) const { return I32; } |
---|
| 657 | virtual void setI32(int32_t v) { I32 = v; } |
---|
| 658 | |
---|
| 659 | virtual double getD(void) const { return D; } |
---|
| 660 | virtual void setD(double v) { D = v; } |
---|
| 661 | |
---|
| 662 | virtual std::string& getS(void) { return S; } |
---|
| 663 | virtual const std::string& getS(void) const { return S; } |
---|
| 664 | |
---|
| 665 | virtual std::string type(void) const; |
---|
| 666 | virtual std::string signature(void) const; |
---|
| 667 | |
---|
| 668 | protected: |
---|
| 669 | |
---|
| 670 | virtual void serialize(hadoop::OArchive& a) const; |
---|
| 671 | virtual void deserialize(hadoop::IArchive& a); |
---|
| 672 | }; |
---|
| 673 | } // end namespace inclrec |
---|
| 674 | |
---|
| 675 | #endif /* _INCLREC_JR_HH_ */ |
---|
| 676 | |
---|
| 677 | </code></pre> |
---|
| 678 | |
---|
| 679 | The testrec.jr.hh file will contain: |
---|
| 680 | |
---|
| 681 | |
---|
| 682 | <pre><code> |
---|
| 683 | |
---|
| 684 | #ifndef _TESTREC_JR_HH_ |
---|
| 685 | #define _TESTREC_JR_HH_ |
---|
| 686 | |
---|
| 687 | #include "inclrec.jr.hh" |
---|
| 688 | |
---|
| 689 | namespace testrec { |
---|
| 690 | class R : public hadoop::Record { |
---|
| 691 | |
---|
| 692 | private: |
---|
| 693 | |
---|
| 694 | std::vector<float> VF; |
---|
| 695 | inclrec::RI Rec; |
---|
| 696 | std::string Buf; |
---|
| 697 | |
---|
| 698 | public: |
---|
| 699 | |
---|
| 700 | R(void); |
---|
| 701 | virtual ~R(void); |
---|
| 702 | |
---|
| 703 | virtual bool operator==(const R& peer) const; |
---|
| 704 | virtual bool operator<(const R& peer) const; |
---|
| 705 | |
---|
| 706 | virtual std::vector<float>& getVF(void); |
---|
| 707 | virtual const std::vector<float>& getVF(void) const; |
---|
| 708 | |
---|
| 709 | virtual std::string& getBuf(void); |
---|
| 710 | virtual const std::string& getBuf(void) const; |
---|
| 711 | |
---|
| 712 | virtual inclrec::RI& getRec(void); |
---|
| 713 | virtual const inclrec::RI& getRec(void) const; |
---|
| 714 | |
---|
| 715 | virtual void serialize(hadoop::OArchive& a) const; |
---|
| 716 | virtual void deserialize(hadoop::IArchive& a); |
---|
| 717 | |
---|
| 718 | virtual std::string type(void) const; |
---|
| 719 | virtual std::string signature(void) const; |
---|
| 720 | }; |
---|
| 721 | } // end namespace testrec |
---|
| 722 | #endif /* _TESTREC_JR_HH_ */ |
---|
| 723 | |
---|
| 724 | </code></pre> |
---|
| 725 | |
---|
| 726 | <h3>Java</h3> |
---|
| 727 | |
---|
| 728 | Code generation for Java is similar to that for C++. A Java class is generated |
---|
| 729 | for each record type with private members corresponding to the fields. Getters |
---|
| 730 | and setters for fields are also generated. Some differences arise in the |
---|
| 731 | way comparison is expressed and in the mapping of modules to packages and |
---|
| 732 | classes to files. For equality testing, an equals method is generated |
---|
| 733 | for each record type. As per Java requirements a hashCode method is also |
---|
| 734 | generated. For comparison a compareTo method is generated for each |
---|
| 735 | record type. This has the semantics as defined by the Java Comparable |
---|
| 736 | interface, that is, the method returns a negative integer, zero, or a positive |
---|
| 737 | integer as the invoked object is less than, equal to, or greater than the |
---|
| 738 | comparison parameter. |
---|
| 739 | |
---|
| 740 | A .java file is generated per record type as opposed to per DDL |
---|
| 741 | file as in C++. The module declaration translates to a Java |
---|
| 742 | package declaration. The module name maps to an identical Java package |
---|
| 743 | name. In addition to this mapping, the DDL compiler creates the appropriate |
---|
| 744 | directory hierarchy for the package and places the generated .java |
---|
| 745 | files in the correct directories. |
---|
| 746 | |
---|
| 747 | <h2>Mapping Summary</h2> |
---|
| 748 | |
---|
| 749 | <pre><code> |
---|
| 750 | DDL Type C++ Type Java Type |
---|
| 751 | |
---|
| 752 | boolean bool boolean |
---|
| 753 | byte int8_t byte |
---|
| 754 | int int32_t int |
---|
| 755 | long int64_t long |
---|
| 756 | float float float |
---|
| 757 | double double double |
---|
| 758 | ustring std::string java.lang.String |
---|
| 759 | buffer std::string org.apache.hadoop.record.Buffer |
---|
| 760 | class type class type class type |
---|
| 761 | vector<type> std::vector<type> java.util.ArrayList<type> |
---|
| 762 | map<type,type> std::map<type,type> java.util.TreeMap<type,type> |
---|
| 763 | </code></pre> |
---|
| 764 | |
---|
| 765 | <h2>Data encodings</h2> |
---|
| 766 | |
---|
| 767 | This section describes the format of the data encodings supported by Hadoop. |
---|
| 768 | Currently, three data encodings are supported, namely binary, CSV and XML. |
---|
| 769 | |
---|
| 770 | <h3>Binary Serialization Format</h3> |
---|
| 771 | |
---|
| 772 | The binary data encoding format is fairly dense. Serialization of composite |
---|
| 773 | types is simply defined as a concatenation of serializations of the constituent |
---|
| 774 | elements (lengths are included in vectors and maps). |
---|
| 775 | |
---|
| 776 | Composite types are serialized as follows: |
---|
| 777 | <ul> |
---|
| 778 | <li> class: Sequence of serialized members. |
---|
| 779 | <li> vector: The number of elements serialized as an int. Followed by a |
---|
| 780 | sequence of serialized elements. |
---|
| 781 | <li> map: The number of key value pairs serialized as an int. Followed |
---|
| 782 | by a sequence of serialized (key,value) pairs. |
---|
| 783 | </ul> |
---|
| 784 | |
---|
| 785 | Serialization of primitives is more interesting, with a zero compression |
---|
| 786 | optimization for integral types and normalization to UTF-8 for strings. |
---|
| 787 | Primitive types are serialized as follows: |
---|
| 788 | |
---|
| 789 | <ul> |
---|
| 790 | <li> byte: Represented by 1 byte, as is. |
---|
| 791 | <li> boolean: Represented by 1-byte (0 or 1) |
---|
| 792 | <li> int/long: Integers and longs are serialized zero compressed. |
---|
| 793 | Represented as 1-byte if -120 <= value < 128. Otherwise, serialized as a |
---|
| 794 | sequence of 2-5 bytes for ints, 2-9 bytes for longs. The first byte represents |
---|
| 795 | the number of trailing bytes, N, as the negative number (-120-N). For example, |
---|
| 796 | the number 1024 (0x400) is represented by the byte sequence 'x86 x04 x00'. |
---|
| 797 | This doesn't help much for 4-byte integers but does a reasonably good job with |
---|
| 798 | longs without bit twiddling. |
---|
| 799 | <li> float/double: Serialized in IEEE 754 single and double precision |
---|
| 800 | format in network byte order. This is the format used by Java. |
---|
| 801 | <li> ustring: Serialized as 4-byte zero compressed length followed by |
---|
| 802 | data encoded as UTF-8. Strings are normalized to UTF-8 regardless of native |
---|
| 803 | language representation. |
---|
| 804 | <li> buffer: Serialized as a 4-byte zero compressed length followed by the |
---|
| 805 | raw bytes in the buffer. |
---|
| 806 | </ul> |
---|
| 807 | |
---|
| 808 | |
---|
| 809 | <h3>CSV Serialization Format</h3> |
---|
| 810 | |
---|
| 811 | The CSV serialization format has a lot more structure than the "standard" |
---|
| 812 | Excel CSV format, but we believe the additional structure is useful because |
---|
| 813 | |
---|
| 814 | <ul> |
---|
| 815 | <li> it makes parsing a lot easier without detracting too much from legibility |
---|
| 816 | <li> the delimiters around composites make it obvious when one is reading a |
---|
| 817 | sequence of Hadoop records |
---|
| 818 | </ul> |
---|
| 819 | |
---|
| 820 | Serialization formats for the various types are detailed in the grammar that |
---|
| 821 | follows. The notable feature of the formats is the use of delimiters for |
---|
| 822 | indicating certain field types. |
---|
| 823 | |
---|
| 824 | <ul> |
---|
| 825 | <li> A string field begins with a single quote ('). |
---|
| 826 | <li> A buffer field begins with a sharp (#). |
---|
| 827 | <li> A class, vector or map begins with 's{', 'v{' or 'm{' respectively and |
---|
| 828 | ends with '}'. |
---|
| 829 | </ul> |
---|
| 830 | |
---|
| 831 | The CSV format can be described by the following grammar: |
---|
| 832 | |
---|
| 833 | <pre><code> |
---|
| 834 | record = primitive / struct / vector / map |
---|
| 835 | primitive = boolean / int / long / float / double / ustring / buffer |
---|
| 836 | |
---|
| 837 | boolean = "T" / "F" |
---|
| 838 | int = ["-"] 1*DIGIT |
---|
| 839 | long = ";" ["-"] 1*DIGIT |
---|
| 840 | float = ["-"] 1*DIGIT "." 1*DIGIT ["E" / "e" ["-"] 1*DIGIT] |
---|
| 841 | double = ";" ["-"] 1*DIGIT "." 1*DIGIT ["E" / "e" ["-"] 1*DIGIT] |
---|
| 842 | |
---|
| 843 | ustring = "'" *(UTF8 char except NULL, LF, % and , / "%00" / "%0a" / "%25" / "%2c" ) |
---|
| 844 | |
---|
| 845 | buffer = "#" *(BYTE except NULL, LF, % and , / "%00" / "%0a" / "%25" / "%2c" ) |
---|
| 846 | |
---|
| 847 | struct = "s{" record *("," record) "}" |
---|
| 848 | vector = "v{" [record *("," record)] "}" |
---|
| 849 | map = "m{" [*(record "," record)] "}" |
---|
| 850 | </code></pre> |
---|
| 851 | |
---|
| 852 | <h3>XML Serialization Format</h3> |
---|
| 853 | |
---|
| 854 | The XML serialization format is the same used by Apache XML-RPC |
---|
| 855 | (http://ws.apache.org/xmlrpc/types.html). This is an extension of the original |
---|
| 856 | XML-RPC format and adds some additional data types. All record I/O types are |
---|
| 857 | not directly expressible in this format, and access to a DDL is required in |
---|
| 858 | order to convert these to valid types. All types primitive or composite are |
---|
| 859 | represented by <value> elements. The particular XML-RPC type is |
---|
| 860 | indicated by a nested element in the <value> element. The encoding for |
---|
| 861 | records is always UTF-8. Primitive types are serialized as follows: |
---|
| 862 | |
---|
| 863 | <ul> |
---|
| 864 | <li> byte: XML tag <ex:i1>. Values: 1-byte unsigned |
---|
| 865 | integers represented in US-ASCII |
---|
| 866 | <li> boolean: XML tag <boolean>. Values: "0" or "1" |
---|
| 867 | <li> int: XML tags <i4> or <int>. Values: 4-byte |
---|
| 868 | signed integers represented in US-ASCII. |
---|
| 869 | <li> long: XML tag <ex:i8>. Values: 8-byte signed integers |
---|
| 870 | represented in US-ASCII. |
---|
| 871 | <li> float: XML tag <ex:float>. Values: Single precision |
---|
| 872 | floating point numbers represented in US-ASCII. |
---|
| 873 | <li> double: XML tag <double>. Values: Double precision |
---|
| 874 | floating point numbers represented in US-ASCII. |
---|
| 875 | <li> ustring: XML tag <string>. Values: String values |
---|
| 876 | represented as UTF-8. XML does not permit all Unicode characters in literal |
---|
| 877 | data. In particular, NULLs and control chars are not allowed. Additionally, |
---|
| 878 | XML processors are required to replace carriage returns with line feeds and to |
---|
| 879 | replace CRLF sequences with line feeds. Programming languages that we work |
---|
| 880 | with do not impose these restrictions on string types. To work around these |
---|
| 881 | restrictions, disallowed characters and CRs are percent escaped in strings. |
---|
| 882 | The '%' character is also percent escaped. |
---|
| 883 | <li> buffer: XML tag <string>. Values: Arbitrary binary |
---|
| 884 | data. Represented as hexBinary, each byte is replaced by its 2-byte |
---|
| 885 | hexadecimal representation. |
---|
| 886 | </ul> |
---|
| 887 | |
---|
| 888 | Composite types are serialized as follows: |
---|
| 889 | |
---|
| 890 | <ul> |
---|
| 891 | <li> class: XML tag <struct>. A struct is a sequence of |
---|
| 892 | <member> elements. Each <member> element has a <name> |
---|
| 893 | element and a <value> element. The <name> is a string that must |
---|
| 894 | match /[a-zA-Z][a-zA-Z0-9_]*/. The value of the member is represented |
---|
| 895 | by a <value> element. |
---|
| 896 | |
---|
| 897 | <li> vector: XML tag <array>. An <array> contains a |
---|
| 898 | single <data> element. The <data> element is a sequence of |
---|
| 899 | <value> elements each of which represents an element of the vector. |
---|
| 900 | |
---|
| 901 | <li> map: XML tag <array>. Same as vector. |
---|
| 902 | |
---|
| 903 | </ul> |
---|
| 904 | |
---|
| 905 | For example: |
---|
| 906 | |
---|
| 907 | <pre><code> |
---|
| 908 | class { |
---|
| 909 | int MY_INT; // value 5 |
---|
| 910 | vector<float> MY_VEC; // values 0.1, -0.89, 2.45e4 |
---|
| 911 | buffer MY_BUF; // value '\00\n\tabc%' |
---|
| 912 | } |
---|
| 913 | </code></pre> |
---|
| 914 | |
---|
| 915 | is serialized as |
---|
| 916 | |
---|
| 917 | <pre><code class="XML"> |
---|
| 918 | <value> |
---|
| 919 | <struct> |
---|
| 920 | <member> |
---|
| 921 | <name>MY_INT</name> |
---|
| 922 | <value><i4>5</i4></value> |
---|
| 923 | </member> |
---|
| 924 | <member> |
---|
| 925 | <name>MY_VEC</name> |
---|
| 926 | <value> |
---|
| 927 | <array> |
---|
| 928 | <data> |
---|
| 929 | <value><ex:float>0.1</ex:float></value> |
---|
| 930 | <value><ex:float>-0.89</ex:float></value> |
---|
| 931 | <value><ex:float>2.45e4</ex:float></value> |
---|
| 932 | </data> |
---|
| 933 | </array> |
---|
| 934 | </value> |
---|
| 935 | </member> |
---|
| 936 | <member> |
---|
| 937 | <name>MY_BUF</name> |
---|
| 938 | <value><string>%00\n\tabc%25</string></value> |
---|
| 939 | </member> |
---|
| 940 | </struct> |
---|
| 941 | </value> |
---|
| 942 | </code></pre> |
---|
| 943 | <P> |
---|
| 944 | |
---|
| 945 | <P> |
---|
| 946 | <DL> |
---|
| 947 | </DL> |
---|
| 948 | <HR> |
---|
| 949 | |
---|
| 950 | |
---|
| 951 | <!-- ======= START OF BOTTOM NAVBAR ====== --> |
---|
| 952 | <A NAME="navbar_bottom"><!-- --></A> |
---|
| 953 | <A HREF="#skip-navbar_bottom" title="Skip navigation links"></A> |
---|
| 954 | <TABLE BORDER="0" WIDTH="100%" CELLPADDING="1" CELLSPACING="0" SUMMARY=""> |
---|
| 955 | <TR> |
---|
| 956 | <TD COLSPAN=2 BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> |
---|
| 957 | <A NAME="navbar_bottom_firstrow"><!-- --></A> |
---|
| 958 | <TABLE BORDER="0" CELLPADDING="0" CELLSPACING="3" SUMMARY=""> |
---|
| 959 | <TR ALIGN="center" VALIGN="top"> |
---|
| 960 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../overview-summary.html"><FONT CLASS="NavBarFont1"><B>Overview</B></FONT></A> </TD> |
---|
| 961 | <TD BGCOLOR="#FFFFFF" CLASS="NavBarCell1Rev"> <FONT CLASS="NavBarFont1Rev"><B>Package</B></FONT> </TD> |
---|
| 962 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <FONT CLASS="NavBarFont1">Class</FONT> </TD> |
---|
| 963 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="package-use.html"><FONT CLASS="NavBarFont1"><B>Use</B></FONT></A> </TD> |
---|
| 964 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="package-tree.html"><FONT CLASS="NavBarFont1"><B>Tree</B></FONT></A> </TD> |
---|
| 965 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../deprecated-list.html"><FONT CLASS="NavBarFont1"><B>Deprecated</B></FONT></A> </TD> |
---|
| 966 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../index-all.html"><FONT CLASS="NavBarFont1"><B>Index</B></FONT></A> </TD> |
---|
| 967 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../help-doc.html"><FONT CLASS="NavBarFont1"><B>Help</B></FONT></A> </TD> |
---|
| 968 | </TR> |
---|
| 969 | </TABLE> |
---|
| 970 | </TD> |
---|
| 971 | <TD ALIGN="right" VALIGN="top" ROWSPAN=3><EM> |
---|
| 972 | </EM> |
---|
| 973 | </TD> |
---|
| 974 | </TR> |
---|
| 975 | |
---|
| 976 | <TR> |
---|
| 977 | <TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2"> |
---|
| 978 | <A HREF="../../../../org/apache/hadoop/net/package-summary.html"><B>PREV PACKAGE</B></A> |
---|
| 979 | <A HREF="../../../../org/apache/hadoop/record/compiler/package-summary.html"><B>NEXT PACKAGE</B></A></FONT></TD> |
---|
| 980 | <TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2"> |
---|
| 981 | <A HREF="../../../../index.html?org/apache/hadoop/record/package-summary.html" target="_top"><B>FRAMES</B></A> |
---|
| 982 | <A HREF="package-summary.html" target="_top"><B>NO FRAMES</B></A> |
---|
| 983 | <SCRIPT type="text/javascript"> |
---|
| 984 | <!-- |
---|
| 985 | if(window==top) { |
---|
| 986 | document.writeln('<A HREF="../../../../allclasses-noframe.html"><B>All Classes</B></A>'); |
---|
| 987 | } |
---|
| 988 | //--> |
---|
| 989 | </SCRIPT> |
---|
| 990 | <NOSCRIPT> |
---|
| 991 | <A HREF="../../../../allclasses-noframe.html"><B>All Classes</B></A> |
---|
| 992 | </NOSCRIPT> |
---|
| 993 | |
---|
| 994 | |
---|
| 995 | </FONT></TD> |
---|
| 996 | </TR> |
---|
| 997 | </TABLE> |
---|
| 998 | <A NAME="skip-navbar_bottom"></A> |
---|
| 999 | <!-- ======== END OF BOTTOM NAVBAR ======= --> |
---|
| 1000 | |
---|
| 1001 | <HR> |
---|
| 1002 | Copyright © 2009 The Apache Software Foundation |
---|
| 1003 | </BODY> |
---|
| 1004 | </HTML> |
---|