<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<!--NewPage-->
<HTML>
<HEAD>
<!-- Generated by javadoc (build 1.6.0_07) on Tue Sep 01 20:57:00 UTC 2009 -->
<TITLE>
org.apache.hadoop.record (Hadoop 0.20.1 API)
</TITLE>

<META NAME="date" CONTENT="2009-09-01">

<LINK REL ="stylesheet" TYPE="text/css" HREF="../../../../stylesheet.css" TITLE="Style">

<SCRIPT type="text/javascript">
function windowTitle()
{
    if (location.href.indexOf('is-external=true') == -1) {
        parent.document.title="org.apache.hadoop.record (Hadoop 0.20.1 API)";
    }
}
</SCRIPT>
<NOSCRIPT>
</NOSCRIPT>

</HEAD>

<BODY BGCOLOR="white" onload="windowTitle();">
<HR>


<!-- ========= START OF TOP NAVBAR ======= -->
<A NAME="navbar_top"><!-- --></A>
<A HREF="#skip-navbar_top" title="Skip navigation links"></A>
<TABLE BORDER="0" WIDTH="100%" CELLPADDING="1" CELLSPACING="0" SUMMARY="">
<TR>
<TD COLSPAN=2 BGCOLOR="#EEEEFF" CLASS="NavBarCell1">
<A NAME="navbar_top_firstrow"><!-- --></A>
<TABLE BORDER="0" CELLPADDING="0" CELLSPACING="3" SUMMARY="">
  <TR ALIGN="center" VALIGN="top">
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../overview-summary.html"><FONT CLASS="NavBarFont1"><B>Overview</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#FFFFFF" CLASS="NavBarCell1Rev"> &nbsp;<FONT CLASS="NavBarFont1Rev"><B>Package</B></FONT>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <FONT CLASS="NavBarFont1">Class</FONT>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="package-use.html"><FONT CLASS="NavBarFont1"><B>Use</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="package-tree.html"><FONT CLASS="NavBarFont1"><B>Tree</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../deprecated-list.html"><FONT CLASS="NavBarFont1"><B>Deprecated</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../index-all.html"><FONT CLASS="NavBarFont1"><B>Index</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../help-doc.html"><FONT CLASS="NavBarFont1"><B>Help</B></FONT></A>&nbsp;</TD>
  </TR>
</TABLE>
</TD>
<TD ALIGN="right" VALIGN="top" ROWSPAN=3><EM>
</EM>
</TD>
</TR>

<TR>
<TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2">
&nbsp;<A HREF="../../../../org/apache/hadoop/net/package-summary.html"><B>PREV PACKAGE</B></A>&nbsp;
&nbsp;<A HREF="../../../../org/apache/hadoop/record/compiler/package-summary.html"><B>NEXT PACKAGE</B></A></FONT></TD>
<TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2">
  <A HREF="../../../../index.html?org/apache/hadoop/record/package-summary.html" target="_top"><B>FRAMES</B></A>  &nbsp;
&nbsp;<A HREF="package-summary.html" target="_top"><B>NO FRAMES</B></A>  &nbsp;
&nbsp;<SCRIPT type="text/javascript">
  <!--
  if(window==top) {
    document.writeln('<A HREF="../../../../allclasses-noframe.html"><B>All Classes</B></A>');
  }
  //-->
</SCRIPT>
<NOSCRIPT>
  <A HREF="../../../../allclasses-noframe.html"><B>All Classes</B></A>
</NOSCRIPT>


</FONT></TD>
</TR>
</TABLE>
<A NAME="skip-navbar_top"></A>
<!-- ========= END OF TOP NAVBAR ========= -->

<HR>
<H2>
Package org.apache.hadoop.record
</H2>
Hadoop record I/O contains classes and a record description language
  translator for simplifying serialization and deserialization of records in a
  language-neutral manner.
<P>
<B>See:</B>
<BR>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<A HREF="#package_description"><B>Description</B></A>
<P>
<TABLE BORDER="1" WIDTH="100%" CELLPADDING="3" CELLSPACING="0" SUMMARY="">
<TR BGCOLOR="#CCCCFF" CLASS="TableHeadingColor">
<TH ALIGN="left" COLSPAN="2"><FONT SIZE="+2">
<B>Interface Summary</B></FONT></TH>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/Index.html" title="interface in org.apache.hadoop.record">Index</A></B></TD>
<TD>Interface that acts as an iterator for deserializing maps.</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/RecordInput.html" title="interface in org.apache.hadoop.record">RecordInput</A></B></TD>
<TD>Interface that all the deserializers have to implement.</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/RecordOutput.html" title="interface in org.apache.hadoop.record">RecordOutput</A></B></TD>
<TD>Interface that all the serializers have to implement.</TD>
</TR>
</TABLE>
&nbsp;

<P>

<TABLE BORDER="1" WIDTH="100%" CELLPADDING="3" CELLSPACING="0" SUMMARY="">
<TR BGCOLOR="#CCCCFF" CLASS="TableHeadingColor">
<TH ALIGN="left" COLSPAN="2"><FONT SIZE="+2">
<B>Class Summary</B></FONT></TH>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/BinaryRecordInput.html" title="class in org.apache.hadoop.record">BinaryRecordInput</A></B></TD>
<TD>&nbsp;</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/BinaryRecordOutput.html" title="class in org.apache.hadoop.record">BinaryRecordOutput</A></B></TD>
<TD>&nbsp;</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/Buffer.html" title="class in org.apache.hadoop.record">Buffer</A></B></TD>
<TD>A byte sequence that is used as a Java native type for buffer.</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/CsvRecordInput.html" title="class in org.apache.hadoop.record">CsvRecordInput</A></B></TD>
<TD>&nbsp;</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/CsvRecordOutput.html" title="class in org.apache.hadoop.record">CsvRecordOutput</A></B></TD>
<TD>&nbsp;</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/Record.html" title="class in org.apache.hadoop.record">Record</A></B></TD>
<TD>Abstract class that is extended by generated classes.</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/RecordComparator.html" title="class in org.apache.hadoop.record">RecordComparator</A></B></TD>
<TD>A raw record comparator base class.</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/Utils.html" title="class in org.apache.hadoop.record">Utils</A></B></TD>
<TD>Various utility functions for the Hadoop record I/O runtime.</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/XmlRecordInput.html" title="class in org.apache.hadoop.record">XmlRecordInput</A></B></TD>
<TD>XML Deserializer.</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/XmlRecordOutput.html" title="class in org.apache.hadoop.record">XmlRecordOutput</A></B></TD>
<TD>XML Serializer.</TD>
</TR>
</TABLE>
&nbsp;

<P>
<A NAME="package_description"><!-- --></A><H2>
Package org.apache.hadoop.record Description
</H2>

<P>
Hadoop record I/O contains classes and a record description language
  translator for simplifying serialization and deserialization of records in a
  language-neutral manner.

  <h2>Introduction</h2>

  Software systems of any significant complexity require mechanisms for data
interchange with the outside world. These interchanges typically involve the
marshaling and unmarshaling of logical units of data to and from data streams
(files, network connections, memory buffers, etc.). Applications usually embed
code for serializing and deserializing the data types that they manipulate.
The work of serialization has several features that make automatic code
generation for it worthwhile. Given a particular output encoding
(binary, XML, etc.), serialization of primitive types and simple compositions
of primitives (structs, vectors, etc.) is a very mechanical task. Manually
written serialization code can be susceptible to bugs, especially when records
have a large number of fields or a record definition changes between software
versions. Lastly, it can be very useful for applications written in different
programming languages to be able to share and interchange data. This can be
made a lot easier by describing the data records manipulated by these
applications in a language-agnostic manner and using the descriptions to derive
implementations of serialization in multiple target languages.

This document describes Hadoop Record I/O, a mechanism aimed at:
<ul> 
<li> enabling the specification of simple serializable data types (records)
<li> enabling the generation of code in multiple target languages for
marshaling and unmarshaling such types
<li> providing target-language-specific support that will enable application
programmers to incorporate generated code into their applications
</ul>

The goals of Hadoop Record I/O are similar to those of mechanisms such as XDR,
ASN.1, PADS and ICE. While these systems all include a DDL that enables
the specification of most record types, they differ widely in what else they
focus on. The focus in Hadoop Record I/O is on data marshaling and
multi-lingual support. We take a translator-based approach to serialization.
Hadoop users have to describe their data in a simple data description
language. The Hadoop DDL translator rcc generates code that users
can invoke in order to read/write their data from/to simple stream
abstractions. Next we list explicitly some of the goals and non-goals of
Hadoop Record I/O.


<h3>Goals</h3>

<ul>
<li> Support for commonly used primitive types. Hadoop should include as
primitives commonly used built-in types from programming languages we intend to
support.

<li> Support for common data compositions (including recursive compositions).
Hadoop should support widely used composite types such as structs and
vectors.

<li> Code generation in multiple target languages. Hadoop should be capable of
generating serialization code in multiple target languages and should be
easily extensible to new target languages. The initial target languages are
C++ and Java.

<li> Support for generated target languages. Hadoop should include support
in the form of headers, libraries, and packages for supported target languages
that enable easy inclusion and use of generated code in applications.

<li> Support for multiple output encodings. Candidates include
packed binary, comma-separated text, XML, etc.

<li> Support for specifying record types in a backwards/forwards compatible
manner. This will probably be in the form of support for optional fields in
records. This version of the document does not include a description of the
planned mechanism; we intend to include it in the next iteration.

</ul>

<h3>Non-Goals</h3>

<ul>
  <li> Serializing existing arbitrary C++ classes.
  <li> Serializing complex data structures such as trees, linked lists, etc.
  <li> Built-in indexing schemes, compression, or check-sums.
  <li> Dynamic construction of objects from an XML schema.
</ul>

The remainder of this document describes the features of Hadoop record I/O
in more detail. Section 2 describes the data types supported by the system.
Section 3 lays out the DDL syntax with some examples of simple records.
Section 4 describes the process of code generation with rcc. Section 5
describes target language mappings and support for Hadoop types. We include a
fairly complete description of C++ mappings, with intent to include Java and
others in upcoming iterations of this document. The last section talks about
supported output encodings.


<h2>Data Types and Streams</h2>

This section describes the primitive and composite types supported by Hadoop.
We aim to support a set of types that can be used to simply and efficiently
express a wide range of record types in different programming languages.

<h3>Primitive Types</h3>

For the most part, the primitive types of Hadoop map directly to primitive
types in high-level programming languages. Special cases are the
ustring (a Unicode string) and buffer types, which we believe
find wide use and which are usually implemented in library code and not
available as language built-ins. Hadoop also supplies these via library code
when a target language built-in is not present and there is no widely
adopted "standard" implementation. The complete list of primitive types is:

<ul>
  <li> byte: An 8-bit unsigned integer.
  <li> boolean: A boolean value.
  <li> int: A 32-bit signed integer.
  <li> long: A 64-bit signed integer.
  <li> float: A single precision floating point number as described by
    IEEE-754.
  <li> double: A double precision floating point number as described by
    IEEE-754.
  <li> ustring: A string consisting of Unicode characters.
  <li> buffer: An arbitrary sequence of bytes.
</ul>


<h3>Composite Types</h3>
Hadoop supports a small set of composite types that enable the description
of simple aggregate types and containers. A composite type is serialized
by sequentially serializing its constituent elements. The supported
composite types are:

<ul>

  <li> record: An aggregate type like a C struct. This is a list of
typed fields that are together considered a single unit of data. A record
is serialized by sequentially serializing its constituent fields. In addition
to serialization, a record has comparison operations (equality and less-than)
implemented for it; these are defined as memberwise comparisons.

  <li> vector: A sequence of entries of the same data type, primitive
or composite.

  <li> map: An associative container mapping instances of a key type to
instances of a value type. The key and value types may themselves be primitive
or composite types.

</ul>

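As an illustration, the following sketch (using the DDL syntax introduced in
Section 3; the module, class, and field names here are hypothetical) declares a
single record that combines all three composite types:

<pre><code>
module example {
    class DocumentStats {
        ustring            docId;
        vector<ustring>    terms;
        map<ustring, int>  termCounts;
    };
}
</code></pre>
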
<h3>Streams</h3>

Hadoop generates code for serializing and deserializing record types to
abstract streams. For each target language Hadoop defines very simple input
and output stream interfaces. Application writers can usually develop
concrete implementations of these by putting a one-method wrapper around
an existing stream implementation.

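As a concrete illustration, here is a minimal sketch of such a wrapper for the
C++ mapping (the InStream interface is declared in Section 5 below;
FileInStream is a hypothetical name, not part of the library):

<pre><code>
#include <cstdio>
#include "recordio.hh"   // declares hadoop::InStream (see the C++ section)

// One-method wrapper: adapts a C FILE* to the hadoop::InStream interface.
class FileInStream : public hadoop::InStream {
public:
  explicit FileInStream(FILE* f) : fp(f) {}
  ssize_t read(void *buf, size_t n) {
    size_t got = fread(buf, 1, n, fp);
    // Same semantics as a blocking read: bytes read, or -1 on error.
    return (got == 0 && ferror(fp)) ? -1 : (ssize_t)got;
  }
private:
  FILE* fp;
};
</code></pre>
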

<h2>DDL Syntax and Examples</h2>

We now describe the syntax of the Hadoop data description language. This is
followed by a few examples of DDL usage.

<h3>Hadoop DDL Syntax</h3>

<pre><code>
recfile = *include module *record
include = "include" path
path = (relative-path / absolute-path)
module = "module" module-name
module-name = name *("." name)
record = "class" name "{" 1*(field) "}"
field = type name ";"
name = ALPHA *(ALPHA / DIGIT / "_")
type = (ptype / ctype)
ptype = ("byte" / "boolean" / "int" /
         "long" / "float" / "double" /
         "ustring" / "buffer")
ctype = ("vector" "<" type ">") /
        ("map" "<" type "," type ">") /
        name
</code></pre>

A DDL file describes one or more record types. It begins with zero or
more include declarations, followed by a single mandatory module declaration,
followed by zero or more class declarations. The semantics of each of
these declarations are described below:

<ul>

<li> include: An include declaration specifies a DDL file to be
referenced when generating code for types in the current DDL file. Record types
in the current compilation unit may refer to types in all included files.
File inclusion is recursive. An include does not trigger code
generation for the referenced file.

<li> module: Every Hadoop DDL file must have a single module
declaration that follows the list of includes and precedes all record
declarations. A module declaration identifies a scope within which
the names of all types in the current file are visible. Module names are
mapped to C++ namespaces, Java packages, etc. in generated code.

<li> class: Record types are specified through class
declarations. A class declaration is like a Java class declaration.
It specifies a named record type and a list of fields that constitute records
of the type. Usage is illustrated in the following examples.

</ul>

<h3>Examples</h3>

<ul>
<li> A simple DDL file, links.jr, with just one record declaration:
<pre><code>
module links {
    class Link {
        ustring URL;
        boolean isRelative;
        ustring anchorText;
    };
}
</code></pre>

<li> A DDL file, outlinks.jr, which includes another:
<pre><code>
include "links.jr"

module outlinks {
    class OutLinks {
        ustring baseURL;
        vector<links.Link> outLinks;
    };
}
</code></pre>
</ul>

<h2>Code Generation</h2>

The Hadoop translator is written in Java. Invocation is done by executing a
wrapper shell script named rcc. It takes a list of
record description files as a mandatory argument, and an
optional language argument, --language or -l (the default is Java).
Thus a typical invocation would look like:
<pre><code>
$ rcc -l C++ <filename> ...
</code></pre>


<h2>Target Language Mappings and Support</h2>

For all target languages, the unit of code generation is a record type.
For each record type, Hadoop generates code for serialization and
deserialization, record comparison, and access to record members.

<h3>C++</h3>

Support for including Hadoop generated C++ code in applications comes in the
form of a header file recordio.hh, which needs to be included in source
that uses Hadoop types, and a library librecordio.a, which applications need
to link against. The header declares the Hadoop C++ namespace, which defines
appropriate types for the various primitives, the basic interfaces for
records and streams, and enumerates the supported serialization encodings.
Declarations of these interfaces and a description of their semantics follow:

<pre><code>
namespace hadoop {

  enum RecFormat { kBinary, kXML, kCSV };

  class InStream {
  public:
    virtual ssize_t read(void *buf, size_t n) = 0;
  };

  class OutStream {
  public:
    virtual ssize_t write(const void *buf, size_t n) = 0;
  };

  class IOError : public std::runtime_error {
  public:
    explicit IOError(const std::string& msg);
  };

  class IArchive;
  class OArchive;

  class RecordReader {
  public:
    RecordReader(InStream& in, RecFormat fmt);
    virtual ~RecordReader(void);

    virtual void read(Record& rec);
  };

  class RecordWriter {
  public:
    RecordWriter(OutStream& out, RecFormat fmt);
    virtual ~RecordWriter(void);

    virtual void write(Record& rec);
  };


  class Record {
  public:
    virtual std::string type(void) const = 0;
    virtual std::string signature(void) const = 0;
  protected:
    virtual bool validate(void) const = 0;

    virtual void
    serialize(OArchive& oa, const std::string& tag) const = 0;

    virtual void
    deserialize(IArchive& ia, const std::string& tag) = 0;
  };
}
</code></pre>

<ul>

<li> RecFormat: An enumeration of the serialization encodings supported
by this implementation of Hadoop.

<li> InStream: A simple abstraction for an input stream. This has a
single public read method that reads n bytes from the stream into
the buffer buf. It has the same semantics as a blocking read system
call, and returns the number of bytes read or -1 if an error occurs.

<li> OutStream: A simple abstraction for an output stream. This has a
single write method that writes n bytes to the stream from the
buffer buf. It has the same semantics as a blocking write system
call, and returns the number of bytes written or -1 if an error occurs.

<li> RecordReader: A RecordReader reads records one at a time from
an underlying stream in a specified record format. The reader is instantiated
with a stream and a serialization format. It has a read method that
takes an instance of a record and deserializes the record from the stream.

<li> RecordWriter: A RecordWriter writes records one at a
time to an underlying stream in a specified record format. The writer is
instantiated with a stream and a serialization format. It has a
write method that takes an instance of a record and serializes the
record to the stream.

<li> Record: The base class for all generated record types. This has two
public methods, type and signature, that return the typename and the
type signature of the record.

</ul>

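As a usage sketch (illustrative, not library code), the Link record generated
from the links.jr example could be round-tripped through the binary encoding
as follows. The stream arguments stand for concrete wrappers like the one
sketched in the Streams section, and the accessor names are assumed to follow
the get/set pattern described below:

<pre><code>
#include "recordio.hh"
#include "links.jr.hh"   // generated from links.jr (see the DDL examples)

void roundTrip(hadoop::OutStream& out, hadoop::InStream& in) {
  links::Link link;
  link.getURL() = "http://hadoop.apache.org/";  // reference accessor
  link.setIsRelative(false);                    // setter for a by-value field
  link.getAnchorText() = "Hadoop";

  hadoop::RecordWriter writer(out, hadoop::kBinary);
  writer.write(link);                           // serialize to the stream

  links::Link copy;
  hadoop::RecordReader reader(in, hadoop::kBinary);
  reader.read(copy);                            // deserialize from the stream
}
</code></pre>
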
Two files are generated for each record file (note: not for each record). If a
record file is named "name.jr", the generated files are
"name.jr.cc" and "name.jr.hh", containing serialization
implementations and record type declarations respectively.

For each record in the DDL file, the generated header file will contain a
class definition corresponding to the record type; method definitions for the
generated type will be present in the '.cc' file. The generated class will
inherit from the abstract class hadoop::Record. The DDL file's
module declaration determines the namespace the record belongs to.
Each '.' delimited token in the module declaration results in the
creation of a namespace. For instance, the declaration module docs.links
results in the creation of a docs namespace and a nested
docs::links namespace. In the preceding examples, the Link class
is placed in the links namespace. The header file corresponding to
the links.jr file will contain:

<pre><code>
namespace links {
  class Link : public hadoop::Record {
    // ....
  };
}
</code></pre>

Each field within the record will cause the generation of a private member
declaration of the appropriate type in the class declaration, and one or more
accessor methods. The generated class will implement the serialize and
deserialize methods defined in hadoop::Record. It will also
implement the inspection methods type and signature from
hadoop::Record. A default constructor and virtual destructor will also
be generated. Serialization code will read/write records into streams that
implement the hadoop::InStream and the hadoop::OutStream interfaces.

For each member of a record an accessor method is generated that returns
either the member or a reference to the member. For members that are returned
by value, a setter method is also generated. This is true for primitive
data members of the types byte, int, long, boolean, float and
double. For example, for an int field called MyField the following
code is generated.

<pre><code>
...
private:
  int32_t mMyField;
  ...
public:
  int32_t getMyField(void) const {
    return mMyField;
  };

  void setMyField(int32_t m) {
    mMyField = m;
  };
  ...
</code></pre>

For a ustring, buffer, or composite field, the generated code
contains only accessors that return a reference to the field. A const
and a non-const accessor are generated. For example:

<pre><code>
...
private:
  std::string mMyBuf;
  ...
public:

  std::string& getMyBuf() {
    return mMyBuf;
  };

  const std::string& getMyBuf() const {
    return mMyBuf;
  };
  ...
</code></pre>

<h4>Examples</h4>

Suppose the inclrec.jr file contains:
<pre><code>
module inclrec {
    class RI {
        int      I32;
        double   D;
        ustring  S;
    };
}
</code></pre>

and the testrec.jr file contains:

<pre><code>
include "inclrec.jr"
module testrec {
    class R {
        vector<float> VF;
        RI            Rec;
        buffer        Buf;
    };
}
</code></pre>

Then an invocation of rcc such as:
<pre><code>
$ rcc -l c++ inclrec.jr testrec.jr
</code></pre>
will result in the generation of four files:
inclrec.jr.{cc,hh} and testrec.jr.{cc,hh}.

The inclrec.jr.hh file will contain:

<pre><code>
#ifndef _INCLREC_JR_HH_
#define _INCLREC_JR_HH_

#include "recordio.hh"

namespace inclrec {

  class RI : public hadoop::Record {

  private:

    int32_t      I32;
    double       D;
    std::string  S;

  public:

    RI(void);
    virtual ~RI(void);

    virtual bool operator==(const RI& peer) const;
    virtual bool operator<(const RI& peer) const;

    virtual int32_t getI32(void) const { return I32; }
    virtual void setI32(int32_t v) { I32 = v; }

    virtual double getD(void) const { return D; }
    virtual void setD(double v) { D = v; }

    virtual std::string& getS(void) { return S; }
    virtual const std::string& getS(void) const { return S; }

    virtual std::string type(void) const;
    virtual std::string signature(void) const;

  protected:

    virtual void serialize(hadoop::OArchive& a) const;
    virtual void deserialize(hadoop::IArchive& a);
  };
} // end namespace inclrec

#endif /* _INCLREC_JR_HH_ */

</code></pre>

The testrec.jr.hh file will contain:


<pre><code>

#ifndef _TESTREC_JR_HH_
#define _TESTREC_JR_HH_

#include "inclrec.jr.hh"

namespace testrec {
  class R : public hadoop::Record {

  private:

    std::vector<float> VF;
    inclrec::RI        Rec;
    std::string        Buf;

  public:

    R(void);
    virtual ~R(void);

    virtual bool operator==(const R& peer) const;
    virtual bool operator<(const R& peer) const;

    virtual std::vector<float>& getVF(void);
    virtual const std::vector<float>& getVF(void) const;

    virtual std::string& getBuf(void);
    virtual const std::string& getBuf(void) const;

    virtual inclrec::RI& getRec(void);
    virtual const inclrec::RI& getRec(void) const;

    virtual std::string type(void) const;
    virtual std::string signature(void) const;

  protected:

    virtual void serialize(hadoop::OArchive& a) const;
    virtual void deserialize(hadoop::IArchive& a);
  };
} // end namespace testrec
#endif /* _TESTREC_JR_HH_ */

</code></pre>

<h3>Java</h3>

Code generation for Java is similar to that for C++. A Java class is generated
for each record type with private members corresponding to the fields. Getters
and setters for fields are also generated. Some differences arise in the
way comparison is expressed and in the mapping of modules to packages and
classes to files. For equality testing, an equals method is generated
for each record type. As per Java requirements, a hashCode method is also
generated. For comparison, a compareTo method is generated for each
record type. This has the semantics defined by the Java Comparable
interface; that is, the method returns a negative integer, zero, or a positive
integer as the invoked object is less than, equal to, or greater than the
comparison parameter.

A .java file is generated per record type, as opposed to per DDL
file as in C++. The module declaration translates to a Java
package declaration. The module name maps to an identical Java package
name. In addition to this mapping, the DDL compiler creates the appropriate
directory hierarchy for the package and places the generated .java
files in the correct directories.

<h2>Mapping Summary</h2>

<pre><code>
DDL Type        C++ Type            Java Type

boolean         bool                boolean
byte            int8_t              byte
int             int32_t             int
long            int64_t             long
float           float               float
double          double              double
ustring         std::string         java.lang.String
buffer          std::string         org.apache.hadoop.record.Buffer
class type      class type          class type
vector<type>    std::vector<type>   java.util.ArrayList<type>
map<type,type>  std::map<type,type> java.util.TreeMap<type,type>
</code></pre>

<h2>Data Encodings</h2>

This section describes the format of the data encodings supported by Hadoop.
Currently, three data encodings are supported, namely binary, CSV and XML.

<h3>Binary Serialization Format</h3>

The binary data encoding format is fairly dense. Serialization of composite
types is simply defined as a concatenation of serializations of the constituent
elements (lengths are included in vectors and maps).

Composite types are serialized as follows:
<ul>
<li> class: Sequence of serialized members.
<li> vector: The number of elements serialized as an int, followed by a
sequence of serialized elements.
<li> map: The number of key-value pairs serialized as an int, followed
by a sequence of serialized (key,value) pairs.
</ul>

Serialization of primitives is more interesting, with a zero-compression
optimization for integral types and normalization to UTF-8 for strings.
Primitive types are serialized as follows:

<ul>
<li> byte: Represented by 1 byte, as is.
<li> boolean: Represented by 1 byte (0 or 1).
<li> int/long: Integers and longs are serialized zero compressed.
Represented as 1 byte if -120 <= value < 128. Otherwise, serialized as a
sequence of 2-5 bytes for ints, 2-9 bytes for longs. The first byte represents
the number of trailing bytes, N, as the negative number (-120-N). For example,
the number 1024 (0x400) is represented by the byte sequence 'x86 x04 x00'.
This doesn't help much for 4-byte integers but does a reasonably good job with
longs without bit twiddling.
<li> float/double: Serialized in IEEE 754 single and double precision
format in network byte order. This is the format used by Java.
<li> ustring: Serialized as a 4-byte zero-compressed length followed by
data encoded as UTF-8. Strings are normalized to UTF-8 regardless of native
language representation.
<li> buffer: Serialized as a 4-byte zero-compressed length followed by the
raw bytes in the buffer.
</ul>

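The zero-compressed integer scheme is straightforward to implement. The
following C++ sketch illustrates the encoding exactly as described above (it
is not the runtime's actual code, and handling of negative values outside
[-120, 128) is omitted for brevity):

<pre><code>
#include <cstdint>
#include <vector>

// Zero-compressed encoding of a long: small values occupy one byte as-is;
// larger values get a lead byte of (-120 - N) followed by N big-endian
// value bytes.
std::vector<int8_t> writeVLong(int64_t value) {
  std::vector<int8_t> out;
  if (value >= -120 && value < 128) {
    out.push_back(static_cast<int8_t>(value));
    return out;
  }
  int n = 0;  // number of trailing bytes needed
  for (uint64_t v = static_cast<uint64_t>(value); v != 0; v >>= 8) n++;
  out.push_back(static_cast<int8_t>(-120 - n));
  for (int shift = (n - 1) * 8; shift >= 0; shift -= 8)
    out.push_back(static_cast<int8_t>((value >> shift) & 0xff));
  return out;
}
// writeVLong(1024) yields the bytes x86 x04 x00, matching the example above.
</code></pre>
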

<h3>CSV Serialization Format</h3>

The CSV serialization format has a lot more structure than the "standard"
Excel CSV format, but we believe the additional structure is useful because

<ul>
<li> it makes parsing a lot easier without detracting too much from legibility
<li> the delimiters around composites make it obvious when one is reading a
sequence of Hadoop records
</ul>

Serialization formats for the various types are detailed in the grammar that
follows. The notable feature of the formats is the use of delimiters to
indicate certain field types:

<ul>
<li> A string field begins with a single quote (').
<li> A buffer field begins with a sharp (#).
<li> A class, vector or map begins with 's{', 'v{' or 'm{' respectively and
ends with '}'.
</ul>

The CSV format can be described by the following grammar:

<pre><code>
record = primitive / struct / vector / map
primitive = boolean / int / long / float / double / ustring / buffer

boolean = "T" / "F"
int = ["-"] 1*DIGIT
long = ";" ["-"] 1*DIGIT
float = ["-"] 1*DIGIT "." 1*DIGIT [("E" / "e") ["-"] 1*DIGIT]
double = ";" ["-"] 1*DIGIT "." 1*DIGIT [("E" / "e") ["-"] 1*DIGIT]

ustring = "'" *(UTF8 char except NULL, LF, % and , / "%00" / "%0a" / "%25" / "%2c" )

buffer = "#" *(BYTE except NULL, LF, % and , / "%00" / "%0a" / "%25" / "%2c" )

struct = "s{" record *("," record) "}"
vector = "v{" [record *("," record)] "}"
map = "m{" [*(record "," record)] "}"
</code></pre>
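
As an illustration constructed from this grammar (not output taken from the
implementation), a Link record from the DDL examples, with URL
"http://hadoop.apache.org/", isRelative false, and anchorText "Hadoop",
would serialize as:

<pre><code>
s{'http://hadoop.apache.org/,F,'Hadoop}
</code></pre>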

<h3>XML Serialization Format</h3>

The XML serialization format is the same as that used by Apache XML-RPC
(http://ws.apache.org/xmlrpc/types.html). This is an extension of the original
XML-RPC format and adds some additional data types. Not all record I/O types
are directly expressible in this format, and access to a DDL is required in
order to convert these to valid types. All types, primitive or composite, are
represented by &lt;value&gt; elements. The particular XML-RPC type is
indicated by a nested element in the &lt;value&gt; element. The encoding for
records is always UTF-8. Primitive types are serialized as follows:

<ul>
<li> byte: XML tag &lt;ex:i1&gt;. Values: 1-byte unsigned
integers represented in US-ASCII.
<li> boolean: XML tag &lt;boolean&gt;. Values: "0" or "1".
<li> int: XML tags &lt;i4&gt; or &lt;int&gt;. Values: 4-byte
signed integers represented in US-ASCII.
<li> long: XML tag &lt;ex:i8&gt;. Values: 8-byte signed integers
represented in US-ASCII.
<li> float: XML tag &lt;ex:float&gt;. Values: Single precision
floating point numbers represented in US-ASCII.
<li> double: XML tag &lt;double&gt;. Values: Double precision
floating point numbers represented in US-ASCII.
<li> ustring: XML tag &lt;string&gt;. Values: String values
represented as UTF-8. XML does not permit all Unicode characters in literal
data. In particular, NULLs and control chars are not allowed. Additionally,
XML processors are required to replace carriage returns with line feeds and to
replace CRLF sequences with line feeds. Programming languages that we work
with do not impose these restrictions on string types. To work around these
restrictions, disallowed characters and CRs are percent escaped in strings.
The '%' character is also percent escaped.
<li> buffer: XML tag &lt;string&gt;. Values: Arbitrary binary
data. Represented as hexBinary: each byte is replaced by its two-character
hexadecimal representation.
</ul>

Composite types are serialized as follows:

<ul>
<li> class: XML tag &lt;struct&gt;. A struct is a sequence of
&lt;member&gt; elements. Each &lt;member&gt; element has a &lt;name&gt;
element and a &lt;value&gt; element. The &lt;name&gt; is a string that must
match /[a-zA-Z][a-zA-Z0-9_]*/. The value of the member is represented
by a &lt;value&gt; element.

<li> vector: XML tag &lt;array&gt;. An &lt;array&gt; contains a
single &lt;data&gt; element. The &lt;data&gt; element is a sequence of
&lt;value&gt; elements, each of which represents an element of the vector.

<li> map: XML tag &lt;array&gt;. Same as vector.

</ul>

For example:

<pre><code>
class {
  int           MY_INT;            // value 5
  vector<float> MY_VEC;            // values 0.1, -0.89, 2.45e4
  buffer        MY_BUF;            // value '\00\n\tabc%'
}
</code></pre>

is serialized as

<pre><code class="XML">
&lt;value&gt;
  &lt;struct&gt;
    &lt;member&gt;
      &lt;name&gt;MY_INT&lt;/name&gt;
      &lt;value&gt;&lt;i4&gt;5&lt;/i4&gt;&lt;/value&gt;
    &lt;/member&gt;
    &lt;member&gt;
      &lt;name&gt;MY_VEC&lt;/name&gt;
      &lt;value&gt;
        &lt;array&gt;
          &lt;data&gt;
            &lt;value&gt;&lt;ex:float&gt;0.1&lt;/ex:float&gt;&lt;/value&gt;
            &lt;value&gt;&lt;ex:float&gt;-0.89&lt;/ex:float&gt;&lt;/value&gt;
            &lt;value&gt;&lt;ex:float&gt;2.45e4&lt;/ex:float&gt;&lt;/value&gt;
          &lt;/data&gt;
        &lt;/array&gt;
      &lt;/value&gt;
    &lt;/member&gt;
    &lt;member&gt;
      &lt;name&gt;MY_BUF&lt;/name&gt;
      &lt;value&gt;&lt;string&gt;%00\n\tabc%25&lt;/string&gt;&lt;/value&gt;
    &lt;/member&gt;
  &lt;/struct&gt;
&lt;/value&gt;
</code></pre>
<P>

<P>
<DL>
</DL>
<HR>


<!-- ======= START OF BOTTOM NAVBAR ====== -->
<A NAME="navbar_bottom"><!-- --></A>
<A HREF="#skip-navbar_bottom" title="Skip navigation links"></A>
<TABLE BORDER="0" WIDTH="100%" CELLPADDING="1" CELLSPACING="0" SUMMARY="">
<TR>
<TD COLSPAN=2 BGCOLOR="#EEEEFF" CLASS="NavBarCell1">
<A NAME="navbar_bottom_firstrow"><!-- --></A>
<TABLE BORDER="0" CELLPADDING="0" CELLSPACING="3" SUMMARY="">
  <TR ALIGN="center" VALIGN="top">
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../overview-summary.html"><FONT CLASS="NavBarFont1"><B>Overview</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#FFFFFF" CLASS="NavBarCell1Rev"> &nbsp;<FONT CLASS="NavBarFont1Rev"><B>Package</B></FONT>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <FONT CLASS="NavBarFont1">Class</FONT>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="package-use.html"><FONT CLASS="NavBarFont1"><B>Use</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="package-tree.html"><FONT CLASS="NavBarFont1"><B>Tree</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../deprecated-list.html"><FONT CLASS="NavBarFont1"><B>Deprecated</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../index-all.html"><FONT CLASS="NavBarFont1"><B>Index</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../help-doc.html"><FONT CLASS="NavBarFont1"><B>Help</B></FONT></A>&nbsp;</TD>
  </TR>
</TABLE>
</TD>
<TD ALIGN="right" VALIGN="top" ROWSPAN=3><EM>
</EM>
</TD>
</TR>

<TR>
<TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2">
&nbsp;<A HREF="../../../../org/apache/hadoop/net/package-summary.html"><B>PREV PACKAGE</B></A>&nbsp;
&nbsp;<A HREF="../../../../org/apache/hadoop/record/compiler/package-summary.html"><B>NEXT PACKAGE</B></A></FONT></TD>
<TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2">
  <A HREF="../../../../index.html?org/apache/hadoop/record/package-summary.html" target="_top"><B>FRAMES</B></A>  &nbsp;
&nbsp;<A HREF="package-summary.html" target="_top"><B>NO FRAMES</B></A>  &nbsp;
&nbsp;<SCRIPT type="text/javascript">
  <!--
  if(window==top) {
    document.writeln('<A HREF="../../../../allclasses-noframe.html"><B>All Classes</B></A>');
  }
  //-->
</SCRIPT>
<NOSCRIPT>
  <A HREF="../../../../allclasses-noframe.html"><B>All Classes</B></A>
</NOSCRIPT>


</FONT></TD>
</TR>
</TABLE>
<A NAME="skip-navbar_bottom"></A>
<!-- ======== END OF BOTTOM NAVBAR ======= -->

<HR>
Copyright &copy; 2009 The Apache Software Foundation
</BODY>
</HTML>