<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<!--NewPage-->
<HTML>
<HEAD>
<!-- Generated by javadoc (build 1.6.0_07) on Tue Sep 01 20:57:00 UTC 2009 -->
<TITLE>
org.apache.hadoop.record (Hadoop 0.20.1 API)
</TITLE>

<META NAME="date" CONTENT="2009-09-01">

<LINK REL ="stylesheet" TYPE="text/css" HREF="../../../../stylesheet.css" TITLE="Style">

<SCRIPT type="text/javascript">
function windowTitle()
{
    if (location.href.indexOf('is-external=true') == -1) {
        parent.document.title="org.apache.hadoop.record (Hadoop 0.20.1 API)";
    }
}
</SCRIPT>
<NOSCRIPT>
</NOSCRIPT>

</HEAD>

<BODY BGCOLOR="white" onload="windowTitle();">
<HR>


<!-- ========= START OF TOP NAVBAR ======= -->
<A NAME="navbar_top"><!-- --></A>
<A HREF="#skip-navbar_top" title="Skip navigation links"></A>
<TABLE BORDER="0" WIDTH="100%" CELLPADDING="1" CELLSPACING="0" SUMMARY="">
<TR>
<TD COLSPAN=2 BGCOLOR="#EEEEFF" CLASS="NavBarCell1">
<A NAME="navbar_top_firstrow"><!-- --></A>
<TABLE BORDER="0" CELLPADDING="0" CELLSPACING="3" SUMMARY="">
  <TR ALIGN="center" VALIGN="top">
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../overview-summary.html"><FONT CLASS="NavBarFont1"><B>Overview</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#FFFFFF" CLASS="NavBarCell1Rev"> &nbsp;<FONT CLASS="NavBarFont1Rev"><B>Package</B></FONT>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <FONT CLASS="NavBarFont1">Class</FONT>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="package-use.html"><FONT CLASS="NavBarFont1"><B>Use</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="package-tree.html"><FONT CLASS="NavBarFont1"><B>Tree</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../deprecated-list.html"><FONT CLASS="NavBarFont1"><B>Deprecated</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../index-all.html"><FONT CLASS="NavBarFont1"><B>Index</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../help-doc.html"><FONT CLASS="NavBarFont1"><B>Help</B></FONT></A>&nbsp;</TD>
  </TR>
</TABLE>
</TD>
<TD ALIGN="right" VALIGN="top" ROWSPAN=3><EM>
</EM>
</TD>
</TR>

<TR>
<TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2">
&nbsp;<A HREF="../../../../org/apache/hadoop/net/package-summary.html"><B>PREV PACKAGE</B></A>&nbsp;
&nbsp;<A HREF="../../../../org/apache/hadoop/record/compiler/package-summary.html"><B>NEXT PACKAGE</B></A></FONT></TD>
<TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2">
  <A HREF="../../../../index.html?org/apache/hadoop/record/package-summary.html" target="_top"><B>FRAMES</B></A>  &nbsp;
&nbsp;<A HREF="package-summary.html" target="_top"><B>NO FRAMES</B></A>  &nbsp;
&nbsp;<SCRIPT type="text/javascript">
  <!--
  if(window==top) {
    document.writeln('<A HREF="../../../../allclasses-noframe.html"><B>All Classes</B></A>');
  }
  //-->
</SCRIPT>
<NOSCRIPT>
  <A HREF="../../../../allclasses-noframe.html"><B>All Classes</B></A>
</NOSCRIPT>


</FONT></TD>
</TR>
</TABLE>
<A NAME="skip-navbar_top"></A>
<!-- ========= END OF TOP NAVBAR ========= -->

<HR>
<H2>
Package org.apache.hadoop.record
</H2>
Hadoop record I/O contains classes and a record description language
  translator for simplifying serialization and deserialization of records in a
  language-neutral manner.
<P>
<B>See:</B>
<BR>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<A HREF="#package_description"><B>Description</B></A>
<P>
<TABLE BORDER="1" WIDTH="100%" CELLPADDING="3" CELLSPACING="0" SUMMARY="">
<TR BGCOLOR="#CCCCFF" CLASS="TableHeadingColor">
<TH ALIGN="left" COLSPAN="2"><FONT SIZE="+2">
<B>Interface Summary</B></FONT></TH>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/Index.html" title="interface in org.apache.hadoop.record">Index</A></B></TD>
<TD>Interface that acts as an iterator for deserializing maps.</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/RecordInput.html" title="interface in org.apache.hadoop.record">RecordInput</A></B></TD>
<TD>Interface that all the deserializers have to implement.</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/RecordOutput.html" title="interface in org.apache.hadoop.record">RecordOutput</A></B></TD>
<TD>Interface that all the serializers have to implement.</TD>
</TR>
</TABLE>
&nbsp;

<P>

<TABLE BORDER="1" WIDTH="100%" CELLPADDING="3" CELLSPACING="0" SUMMARY="">
<TR BGCOLOR="#CCCCFF" CLASS="TableHeadingColor">
<TH ALIGN="left" COLSPAN="2"><FONT SIZE="+2">
<B>Class Summary</B></FONT></TH>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/BinaryRecordInput.html" title="class in org.apache.hadoop.record">BinaryRecordInput</A></B></TD>
<TD>&nbsp;</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/BinaryRecordOutput.html" title="class in org.apache.hadoop.record">BinaryRecordOutput</A></B></TD>
<TD>&nbsp;</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/Buffer.html" title="class in org.apache.hadoop.record">Buffer</A></B></TD>
<TD>A byte sequence that is used as a Java native type for buffer.</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/CsvRecordInput.html" title="class in org.apache.hadoop.record">CsvRecordInput</A></B></TD>
<TD>&nbsp;</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/CsvRecordOutput.html" title="class in org.apache.hadoop.record">CsvRecordOutput</A></B></TD>
<TD>&nbsp;</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/Record.html" title="class in org.apache.hadoop.record">Record</A></B></TD>
<TD>Abstract class that is extended by generated classes.</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/RecordComparator.html" title="class in org.apache.hadoop.record">RecordComparator</A></B></TD>
<TD>A raw record comparator base class.</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/Utils.html" title="class in org.apache.hadoop.record">Utils</A></B></TD>
<TD>Various utility functions for the Hadoop record I/O runtime.</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/XmlRecordInput.html" title="class in org.apache.hadoop.record">XmlRecordInput</A></B></TD>
<TD>XML Deserializer.</TD>
</TR>
<TR BGCOLOR="white" CLASS="TableRowColor">
<TD WIDTH="15%"><B><A HREF="../../../../org/apache/hadoop/record/XmlRecordOutput.html" title="class in org.apache.hadoop.record">XmlRecordOutput</A></B></TD>
<TD>XML Serializer.</TD>
</TR>
</TABLE>
&nbsp;

<P>
<A NAME="package_description"><!-- --></A><H2>
Package org.apache.hadoop.record Description
</H2>

<P>
Hadoop record I/O contains classes and a record description language
  translator for simplifying serialization and deserialization of records in a
  language-neutral manner.

  <h2>Introduction</h2>

  Software systems of any significant complexity require mechanisms for data
interchange with the outside world. These interchanges typically involve the
marshaling and unmarshaling of logical units of data to and from data streams
(files, network connections, memory buffers, etc.). Applications usually embed
code for serializing and deserializing the data types that they manipulate.
The work of serialization has several features that make automatic code
generation for it worthwhile. Given a particular output encoding
(binary, XML, etc.), serialization of primitive types and simple compositions
of primitives (structs, vectors, etc.) is a very mechanical task. Manually
written serialization code can be susceptible to bugs, especially when records
have a large number of fields or a record definition changes between software
versions. Lastly, it can be very useful for applications written in different
programming languages to be able to share and interchange data. This can be
made a lot easier by describing the data records manipulated by these
applications in a language-agnostic manner and using the descriptions to derive
implementations of serialization in multiple target languages.

This document describes Hadoop Record I/O, a mechanism aimed at:
<ul> 
<li> enabling the specification of simple serializable data types (records)
<li> enabling the generation of code in multiple target languages for
marshaling and unmarshaling such types
<li> providing target-language-specific support that will enable application
programmers to incorporate generated code into their applications
</ul>

The goals of Hadoop Record I/O are similar to those of mechanisms such as XDR,
ASN.1, PADS and ICE. While these systems all include a DDL that enables
the specification of most record types, they differ widely in what else they
focus on. The focus in Hadoop Record I/O is on data marshaling and
multi-lingual support. We take a translator-based approach to serialization.
Hadoop users have to describe their data in a simple data description
language. The Hadoop DDL translator rcc generates code that users
can invoke in order to read/write their data from/to simple stream
abstractions. Next we list explicitly some of the goals and non-goals of
Hadoop Record I/O.


<h3>Goals</h3>

<ul>
<li> Support for commonly used primitive types. Hadoop should include as
primitives commonly used built-in types from programming languages we intend to
support.

<li> Support for common data compositions (including recursive compositions).
Hadoop should support widely used composite types such as structs and
vectors.

<li> Code generation in multiple target languages. Hadoop should be capable of
generating serialization code in multiple target languages and should be
easily extensible to new target languages. The initial target languages are
C++ and Java.

<li> Support for generated target languages. Hadoop should include support
in the form of headers, libraries, and packages for supported target languages
that enable easy inclusion and use of generated code in applications.

<li> Support for multiple output encodings. Candidates include
packed binary, comma-separated text, XML, etc.

<li> Support for specifying record types in a backwards/forwards compatible
manner. This will probably be in the form of support for optional fields in
records. This version of the document does not include a description of the
planned mechanism; we intend to include it in the next iteration.

</ul>

<h3>Non-Goals</h3>

<ul>
  <li> Serializing existing arbitrary C++ classes.
  <li> Serializing complex data structures such as trees, linked lists, etc.
  <li> Built-in indexing schemes, compression, or check-sums.
  <li> Dynamic construction of objects from an XML schema.
</ul>

The remainder of this document describes the features of Hadoop record I/O
in more detail. Section 2 describes the data types supported by the system.
Section 3 lays out the DDL syntax with some examples of simple records.
Section 4 describes the process of code generation with rcc. Section 5
describes target language mappings and support for Hadoop types. We include a
fairly complete description of C++ mappings, with intent to include Java and
others in upcoming iterations of this document. The last section talks about
supported output encodings.


<h2>Data Types and Streams</h2>

This section describes the primitive and composite types supported by Hadoop.
We aim to support a set of types that can be used to simply and efficiently
express a wide range of record types in different programming languages.

<h3>Primitive Types</h3>

For the most part, the primitive types of Hadoop map directly to primitive
types in high-level programming languages. Special cases are the
ustring (a Unicode string) and buffer types, which we believe
find wide use and which are usually implemented in library code and not
available as language built-ins. Hadoop also supplies these via library code
when a target language built-in is not present and there is no widely
adopted "standard" implementation. The complete list of primitive types is:

<ul>
  <li> byte: An 8-bit unsigned integer.
  <li> boolean: A boolean value.
  <li> int: A 32-bit signed integer.
  <li> long: A 64-bit signed integer.
  <li> float: A single precision floating point number as described by
    IEEE-754.
  <li> double: A double precision floating point number as described by
    IEEE-754.
  <li> ustring: A string consisting of Unicode characters.
  <li> buffer: An arbitrary sequence of bytes.
</ul>


<h3>Composite Types</h3>
Hadoop supports a small set of composite types that enable the description
of simple aggregate types and containers. A composite type is serialized
by sequentially serializing its constituent elements. The supported
composite types are:

<ul>

  <li> record: An aggregate type like a C struct. This is a list of
typed fields that are together considered a single unit of data. A record
is serialized by sequentially serializing its constituent fields. In addition
to serialization, a record has comparison operations (equality and less-than)
implemented for it; these are defined as memberwise comparisons.

  <li> vector: A sequence of entries of the same data type, primitive
or composite.

  <li> map: An associative container mapping instances of a key type to
instances of a value type. The key and value types may themselves be primitive
or composite types.

</ul>

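As an illustration, the following sketch (using the DDL syntax introduced in
Section 3; the module, class, and field names here are hypothetical) declares a
single record that combines all three composite types:

<pre><code>
module example {
    class DocumentStats {
        ustring            docId;
        vector<ustring>    terms;
        map<ustring, int>  termCounts;
    };
}
</code></pre>
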
<h3>Streams</h3>

Hadoop generates code for serializing and deserializing record types to
abstract streams. For each target language Hadoop defines very simple input
and output stream interfaces. Application writers can usually develop
concrete implementations of these by putting a one-method wrapper around
an existing stream implementation.

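As a concrete illustration, here is a minimal sketch of such a wrapper for the
C++ mapping (the InStream interface is declared in Section 5 below;
FileInStream is a hypothetical name, not part of the library):

<pre><code>
#include <cstdio>
#include "recordio.hh"   // declares hadoop::InStream (see the C++ section)

// One-method wrapper: adapts a C FILE* to the hadoop::InStream interface.
class FileInStream : public hadoop::InStream {
public:
  explicit FileInStream(FILE* f) : fp(f) {}
  ssize_t read(void *buf, size_t n) {
    size_t got = fread(buf, 1, n, fp);
    // Same semantics as a blocking read: bytes read, or -1 on error.
    return (got == 0 && ferror(fp)) ? -1 : (ssize_t)got;
  }
private:
  FILE* fp;
};
</code></pre>
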

<h2>DDL Syntax and Examples</h2>

We now describe the syntax of the Hadoop data description language. This is
followed by a few examples of DDL usage.

<h3>Hadoop DDL Syntax</h3>

<pre><code>
recfile = *include module *record
include = "include" path
path = (relative-path / absolute-path)
module = "module" module-name
module-name = name *("." name)
record = "class" name "{" 1*(field) "}"
field = type name ";"
name = ALPHA *(ALPHA / DIGIT / "_")
type = (ptype / ctype)
ptype = ("byte" / "boolean" / "int" /
         "long" / "float" / "double" /
         "ustring" / "buffer")
ctype = ("vector" "<" type ">") /
        ("map" "<" type "," type ">") /
        name
</code></pre>

A DDL file describes one or more record types. It begins with zero or
more include declarations, followed by a single mandatory module declaration,
followed by zero or more class declarations. The semantics of each of
these declarations are described below:

<ul>

<li> include: An include declaration specifies a DDL file to be
referenced when generating code for types in the current DDL file. Record types
in the current compilation unit may refer to types in all included files.
File inclusion is recursive. An include does not trigger code
generation for the referenced file.

<li> module: Every Hadoop DDL file must have a single module
declaration that follows the list of includes and precedes all record
declarations. A module declaration identifies a scope within which
the names of all types in the current file are visible. Module names are
mapped to C++ namespaces, Java packages, etc. in generated code.

<li> class: Record types are specified through class
declarations. A class declaration is like a Java class declaration.
It specifies a named record type and a list of fields that constitute records
of the type. Usage is illustrated in the following examples.

</ul>

<h3>Examples</h3>

<ul>
<li> A simple DDL file, links.jr, with just one record declaration:
<pre><code>
module links {
    class Link {
        ustring URL;
        boolean isRelative;
        ustring anchorText;
    };
}
</code></pre>

<li> A DDL file, outlinks.jr, which includes another:
<pre><code>
include "links.jr"

module outlinks {
    class OutLinks {
        ustring baseURL;
        vector<links.Link> outLinks;
    };
}
</code></pre>
</ul>

<h2>Code Generation</h2>

The Hadoop translator is written in Java. Invocation is done by executing a
wrapper shell script named rcc. It takes a list of
record description files as a mandatory argument, and an
optional language argument, --language or -l (the default is Java).
Thus a typical invocation would look like:
<pre><code>
$ rcc -l C++ <filename> ...
</code></pre>


<h2>Target Language Mappings and Support</h2>

For all target languages, the unit of code generation is a record type.
For each record type, Hadoop generates code for serialization and
deserialization, record comparison, and access to record members.

<h3>C++</h3>

Support for including Hadoop generated C++ code in applications comes in the
form of a header file recordio.hh, which needs to be included in source
that uses Hadoop types, and a library librecordio.a, which applications need
to link against. The header declares the Hadoop C++ namespace, which defines
appropriate types for the various primitives, the basic interfaces for
records and streams, and enumerates the supported serialization encodings.
Declarations of these interfaces and a description of their semantics follow:

<pre><code>
namespace hadoop {

  enum RecFormat { kBinary, kXML, kCSV };

  class InStream {
  public:
    virtual ssize_t read(void *buf, size_t n) = 0;
  };

  class OutStream {
  public:
    virtual ssize_t write(const void *buf, size_t n) = 0;
  };

  class IOError : public std::runtime_error {
  public:
    explicit IOError(const std::string& msg);
  };

  class IArchive;
  class OArchive;

  class RecordReader {
  public:
    RecordReader(InStream& in, RecFormat fmt);
    virtual ~RecordReader(void);

    virtual void read(Record& rec);
  };

  class RecordWriter {
  public:
    RecordWriter(OutStream& out, RecFormat fmt);
    virtual ~RecordWriter(void);

    virtual void write(Record& rec);
  };


  class Record {
  public:
    virtual std::string type(void) const = 0;
    virtual std::string signature(void) const = 0;
  protected:
    virtual bool validate(void) const = 0;

    virtual void
    serialize(OArchive& oa, const std::string& tag) const = 0;

    virtual void
    deserialize(IArchive& ia, const std::string& tag) = 0;
  };
}
</code></pre>

<ul>

<li> RecFormat: An enumeration of the serialization encodings supported
by this implementation of Hadoop.

<li> InStream: A simple abstraction for an input stream. This has a
single public read method that reads n bytes from the stream into
the buffer buf. It has the same semantics as a blocking read system
call, and returns the number of bytes read or -1 if an error occurs.

<li> OutStream: A simple abstraction for an output stream. This has a
single write method that writes n bytes to the stream from the
buffer buf. It has the same semantics as a blocking write system
call, and returns the number of bytes written or -1 if an error occurs.

<li> RecordReader: A RecordReader reads records one at a time from
an underlying stream in a specified record format. The reader is instantiated
with a stream and a serialization format. It has a read method that
takes an instance of a record and deserializes the record from the stream.

<li> RecordWriter: A RecordWriter writes records one at a
time to an underlying stream in a specified record format. The writer is
instantiated with a stream and a serialization format. It has a
write method that takes an instance of a record and serializes the
record to the stream.

<li> Record: The base class for all generated record types. This has two
public methods, type and signature, that return the typename and the
type signature of the record.

</ul>

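As a usage sketch (illustrative, not library code), the Link record generated
from the links.jr example could be round-tripped through the binary encoding
as follows. The stream arguments stand for concrete wrappers like the one
sketched in the Streams section, and the accessor names are assumed to follow
the get/set pattern described below:

<pre><code>
#include "recordio.hh"
#include "links.jr.hh"   // generated from links.jr (see the DDL examples)

void roundTrip(hadoop::OutStream& out, hadoop::InStream& in) {
  links::Link link;
  link.getURL() = "http://hadoop.apache.org/";  // reference accessor
  link.setIsRelative(false);                    // setter for a by-value field
  link.getAnchorText() = "Hadoop";

  hadoop::RecordWriter writer(out, hadoop::kBinary);
  writer.write(link);                           // serialize to the stream

  links::Link copy;
  hadoop::RecordReader reader(in, hadoop::kBinary);
  reader.read(copy);                            // deserialize from the stream
}
</code></pre>
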
Two files are generated for each record file (note: not for each record). If a
record file is named "name.jr", the generated files are
"name.jr.cc" and "name.jr.hh", containing serialization
implementations and record type declarations respectively.

For each record in the DDL file, the generated header file will contain a
class definition corresponding to the record type; method definitions for the
generated type will be present in the '.cc' file. The generated class will
inherit from the abstract class hadoop::Record. The DDL file's
module declaration determines the namespace the record belongs to.
Each '.' delimited token in the module declaration results in the
creation of a namespace. For instance, the declaration module docs.links
results in the creation of a docs namespace and a nested
docs::links namespace. In the preceding examples, the Link class
is placed in the links namespace. The header file corresponding to
the links.jr file will contain:

<pre><code>
namespace links {
  class Link : public hadoop::Record {
    // ....
  };
}
</code></pre>

Each field within the record will cause the generation of a private member
declaration of the appropriate type in the class declaration, and one or more
accessor methods. The generated class will implement the serialize and
deserialize methods defined in hadoop::Record. It will also
implement the inspection methods type and signature from
hadoop::Record. A default constructor and virtual destructor will also
be generated. Serialization code will read/write records into streams that
implement the hadoop::InStream and the hadoop::OutStream interfaces.

For each member of a record an accessor method is generated that returns
either the member or a reference to the member. For members that are returned
by value, a setter method is also generated. This is true for primitive
data members of the types byte, int, long, boolean, float and
double. For example, for an int field called MyField the following
code is generated.

<pre><code>
...
private:
  int32_t mMyField;
  ...
public:
  int32_t getMyField(void) const {
    return mMyField;
  };

  void setMyField(int32_t m) {
    mMyField = m;
  };
  ...
</code></pre>

For a ustring, buffer, or composite field, the generated code
contains only accessors that return a reference to the field. A const
and a non-const accessor are generated. For example:

<pre><code>
...
private:
  std::string mMyBuf;
  ...
public:

  std::string& getMyBuf() {
    return mMyBuf;
  };

  const std::string& getMyBuf() const {
    return mMyBuf;
  };
  ...
</code></pre>

<h4>Examples</h4>

Suppose the inclrec.jr file contains:
<pre><code>
module inclrec {
    class RI {
        int      I32;
        double   D;
        ustring  S;
    };
}
</code></pre>

and the testrec.jr file contains:

<pre><code>
include "inclrec.jr"
module testrec {
    class R {
        vector<float> VF;
        RI            Rec;
        buffer        Buf;
    };
}
</code></pre>

Then an invocation of rcc such as:
<pre><code>
$ rcc -l c++ inclrec.jr testrec.jr
</code></pre>
will result in the generation of four files:
inclrec.jr.{cc,hh} and testrec.jr.{cc,hh}.

The inclrec.jr.hh file will contain:

<pre><code>
#ifndef _INCLREC_JR_HH_
#define _INCLREC_JR_HH_

#include "recordio.hh"

namespace inclrec {

  class RI : public hadoop::Record {

  private:

    int32_t      I32;
    double       D;
    std::string  S;

  public:

    RI(void);
    virtual ~RI(void);

    virtual bool operator==(const RI& peer) const;
    virtual bool operator<(const RI& peer) const;

    virtual int32_t getI32(void) const { return I32; }
    virtual void setI32(int32_t v) { I32 = v; }

    virtual double getD(void) const { return D; }
    virtual void setD(double v) { D = v; }

    virtual std::string& getS(void) { return S; }
    virtual const std::string& getS(void) const { return S; }

    virtual std::string type(void) const;
    virtual std::string signature(void) const;

  protected:

    virtual void serialize(hadoop::OArchive& a) const;
    virtual void deserialize(hadoop::IArchive& a);
  };
} // end namespace inclrec

#endif /* _INCLREC_JR_HH_ */

</code></pre>

The testrec.jr.hh file will contain:


<pre><code>

#ifndef _TESTREC_JR_HH_
#define _TESTREC_JR_HH_

#include "inclrec.jr.hh"

namespace testrec {
  class R : public hadoop::Record {

  private:

    std::vector<float> VF;
    inclrec::RI        Rec;
    std::string        Buf;

  public:

    R(void);
    virtual ~R(void);

    virtual bool operator==(const R& peer) const;
    virtual bool operator<(const R& peer) const;

    virtual std::vector<float>& getVF(void);
    virtual const std::vector<float>& getVF(void) const;

    virtual std::string& getBuf(void);
    virtual const std::string& getBuf(void) const;

    virtual inclrec::RI& getRec(void);
    virtual const inclrec::RI& getRec(void) const;

    virtual std::string type(void) const;
    virtual std::string signature(void) const;

  protected:

    virtual void serialize(hadoop::OArchive& a) const;
    virtual void deserialize(hadoop::IArchive& a);
  };
} // end namespace testrec
#endif /* _TESTREC_JR_HH_ */

</code></pre>

<h3>Java</h3>

Code generation for Java is similar to that for C++. A Java class is generated
for each record type with private members corresponding to the fields. Getters
and setters for fields are also generated. Some differences arise in the
way comparison is expressed and in the mapping of modules to packages and
classes to files. For equality testing, an equals method is generated
for each record type. As per Java requirements, a hashCode method is also
generated. For comparison, a compareTo method is generated for each
record type. This has the semantics defined by the Java Comparable
interface; that is, the method returns a negative integer, zero, or a positive
integer as the invoked object is less than, equal to, or greater than the
comparison parameter.

A .java file is generated per record type, as opposed to per DDL
file as in C++. The module declaration translates to a Java
package declaration. The module name maps to an identical Java package
name. In addition to this mapping, the DDL compiler creates the appropriate
directory hierarchy for the package and places the generated .java
files in the correct directories.

<h2>Mapping Summary</h2>

<pre><code>
DDL Type        C++ Type            Java Type

boolean         bool                boolean
byte            int8_t              byte
int             int32_t             int
long            int64_t             long
float           float               float
double          double              double
ustring         std::string         java.lang.String
buffer          std::string         org.apache.hadoop.record.Buffer
class type      class type          class type
vector<type>    std::vector<type>   java.util.ArrayList<type>
map<type,type>  std::map<type,type> java.util.TreeMap<type,type>
</code></pre>

<h2>Data Encodings</h2>

This section describes the format of the data encodings supported by Hadoop.
Currently, three data encodings are supported, namely binary, CSV and XML.

<h3>Binary Serialization Format</h3>

The binary data encoding format is fairly dense. Serialization of composite
types is simply defined as a concatenation of serializations of the constituent
elements (lengths are included in vectors and maps).

Composite types are serialized as follows:
<ul>
<li> class: Sequence of serialized members.
<li> vector: The number of elements serialized as an int, followed by a
sequence of serialized elements.
<li> map: The number of key-value pairs serialized as an int, followed
by a sequence of serialized (key,value) pairs.
</ul>

Serialization of primitives is more interesting, with a zero-compression
optimization for integral types and normalization to UTF-8 for strings.
Primitive types are serialized as follows:

<ul>
<li> byte: Represented by 1 byte, as is.
<li> boolean: Represented by 1 byte (0 or 1).
<li> int/long: Integers and longs are serialized zero compressed.
Represented as 1 byte if -120 <= value < 128. Otherwise, serialized as a
sequence of 2-5 bytes for ints, 2-9 bytes for longs. The first byte represents
the number of trailing bytes, N, as the negative number (-120-N). For example,
the number 1024 (0x400) is represented by the byte sequence 'x86 x04 x00'.
This doesn't help much for 4-byte integers but does a reasonably good job with
longs without bit twiddling.
<li> float/double: Serialized in IEEE 754 single and double precision
format in network byte order. This is the format used by Java.
<li> ustring: Serialized as a 4-byte zero-compressed length followed by
data encoded as UTF-8. Strings are normalized to UTF-8 regardless of native
language representation.
<li> buffer: Serialized as a 4-byte zero-compressed length followed by the
raw bytes in the buffer.
</ul>

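The zero-compressed integer scheme is straightforward to implement. The
following C++ sketch illustrates the encoding exactly as described above (it
is not the runtime's actual code, and handling of negative values outside
[-120, 128) is omitted for brevity):

<pre><code>
#include <cstdint>
#include <vector>

// Zero-compressed encoding of a long: small values occupy one byte as-is;
// larger values get a lead byte of (-120 - N) followed by N big-endian
// value bytes.
std::vector<int8_t> writeVLong(int64_t value) {
  std::vector<int8_t> out;
  if (value >= -120 && value < 128) {
    out.push_back(static_cast<int8_t>(value));
    return out;
  }
  int n = 0;  // number of trailing bytes needed
  for (uint64_t v = static_cast<uint64_t>(value); v != 0; v >>= 8) n++;
  out.push_back(static_cast<int8_t>(-120 - n));
  for (int shift = (n - 1) * 8; shift >= 0; shift -= 8)
    out.push_back(static_cast<int8_t>((value >> shift) & 0xff));
  return out;
}
// writeVLong(1024) yields the bytes x86 x04 x00, matching the example above.
</code></pre>
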

<h3>CSV Serialization Format</h3>

The CSV serialization format has a lot more structure than the "standard"
Excel CSV format, but we believe the additional structure is useful because

<ul>
<li> it makes parsing a lot easier without detracting too much from legibility
<li> the delimiters around composites make it obvious when one is reading a
sequence of Hadoop records
</ul>

Serialization formats for the various types are detailed in the grammar that
follows. The notable feature of the formats is the use of delimiters to
indicate certain field types:

<ul>
<li> A string field begins with a single quote (').
<li> A buffer field begins with a sharp (#).
<li> A class, vector or map begins with 's{', 'v{' or 'm{' respectively and
ends with '}'.
</ul>

The CSV format can be described by the following grammar:

<pre><code>
record = primitive / struct / vector / map
primitive = boolean / int / long / float / double / ustring / buffer

boolean = "T" / "F"
int = ["-"] 1*DIGIT
long = ";" ["-"] 1*DIGIT
float = ["-"] 1*DIGIT "." 1*DIGIT [("E" / "e") ["-"] 1*DIGIT]
double = ";" ["-"] 1*DIGIT "." 1*DIGIT [("E" / "e") ["-"] 1*DIGIT]

ustring = "'" *(UTF8 char except NULL, LF, % and , / "%00" / "%0a" / "%25" / "%2c" )

buffer = "#" *(BYTE except NULL, LF, % and , / "%00" / "%0a" / "%25" / "%2c" )

struct = "s{" record *("," record) "}"
vector = "v{" [record *("," record)] "}"
map = "m{" [*(record "," record)] "}"
</code></pre>
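
As an illustration constructed from this grammar (not output taken from the
implementation), a Link record from the DDL examples, with URL
"http://hadoop.apache.org/", isRelative false, and anchorText "Hadoop",
would serialize as:

<pre><code>
s{'http://hadoop.apache.org/,F,'Hadoop}
</code></pre>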

<h3>XML Serialization Format</h3>

The XML serialization format is the same as that used by Apache XML-RPC
(http://ws.apache.org/xmlrpc/types.html). This is an extension of the original
XML-RPC format and adds some additional data types. Not all record I/O types
are directly expressible in this format, and access to a DDL is required in
order to convert these to valid types. All types, primitive or composite, are
represented by &lt;value&gt; elements. The particular XML-RPC type is
indicated by a nested element in the &lt;value&gt; element. The encoding for
records is always UTF-8. Primitive types are serialized as follows:

<ul>
<li> byte: XML tag &lt;ex:i1&gt;. Values: 1-byte unsigned
integers represented in US-ASCII.
<li> boolean: XML tag &lt;boolean&gt;. Values: "0" or "1".
<li> int: XML tags &lt;i4&gt; or &lt;int&gt;. Values: 4-byte
signed integers represented in US-ASCII.
<li> long: XML tag &lt;ex:i8&gt;. Values: 8-byte signed integers
represented in US-ASCII.
<li> float: XML tag &lt;ex:float&gt;. Values: Single precision
floating point numbers represented in US-ASCII.
<li> double: XML tag &lt;double&gt;. Values: Double precision
floating point numbers represented in US-ASCII.
<li> ustring: XML tag &lt;string&gt;. Values: String values
represented as UTF-8. XML does not permit all Unicode characters in literal
data. In particular, NULLs and control chars are not allowed. Additionally,
XML processors are required to replace carriage returns with line feeds and to
replace CRLF sequences with line feeds. Programming languages that we work
with do not impose these restrictions on string types. To work around these
restrictions, disallowed characters and CRs are percent escaped in strings.
The '%' character is also percent escaped.
<li> buffer: XML tag &lt;string&gt;. Values: Arbitrary binary
data. Represented as hexBinary: each byte is replaced by its two-character
hexadecimal representation.
</ul>

Composite types are serialized as follows:

<ul>
<li> class: XML tag &lt;struct&gt;. A struct is a sequence of
&lt;member&gt; elements. Each &lt;member&gt; element has a &lt;name&gt;
element and a &lt;value&gt; element. The &lt;name&gt; is a string that must
match /[a-zA-Z][a-zA-Z0-9_]*/. The value of the member is represented
by a &lt;value&gt; element.

<li> vector: XML tag &lt;array&gt;. An &lt;array&gt; contains a
single &lt;data&gt; element. The &lt;data&gt; element is a sequence of
&lt;value&gt; elements, each of which represents an element of the vector.

<li> map: XML tag &lt;array&gt;. Same as vector.

</ul>

For example:

<pre><code>
class {
  int           MY_INT;            // value 5
  vector<float> MY_VEC;            // values 0.1, -0.89, 2.45e4
  buffer        MY_BUF;            // value '\00\n\tabc%'
}
</code></pre>

is serialized as

<pre><code class="XML">
&lt;value&gt;
  &lt;struct&gt;
    &lt;member&gt;
      &lt;name&gt;MY_INT&lt;/name&gt;
      &lt;value&gt;&lt;i4&gt;5&lt;/i4&gt;&lt;/value&gt;
    &lt;/member&gt;
    &lt;member&gt;
      &lt;name&gt;MY_VEC&lt;/name&gt;
      &lt;value&gt;
        &lt;array&gt;
          &lt;data&gt;
            &lt;value&gt;&lt;ex:float&gt;0.1&lt;/ex:float&gt;&lt;/value&gt;
            &lt;value&gt;&lt;ex:float&gt;-0.89&lt;/ex:float&gt;&lt;/value&gt;
            &lt;value&gt;&lt;ex:float&gt;2.45e4&lt;/ex:float&gt;&lt;/value&gt;
          &lt;/data&gt;
        &lt;/array&gt;
      &lt;/value&gt;
    &lt;/member&gt;
    &lt;member&gt;
      &lt;name&gt;MY_BUF&lt;/name&gt;
      &lt;value&gt;&lt;string&gt;%00\n\tabc%25&lt;/string&gt;&lt;/value&gt;
    &lt;/member&gt;
  &lt;/struct&gt;
&lt;/value&gt;
</code></pre>
<P>

<P>
<DL>
</DL>
<HR>


<!-- ======= START OF BOTTOM NAVBAR ====== -->
<A NAME="navbar_bottom"><!-- --></A>
<A HREF="#skip-navbar_bottom" title="Skip navigation links"></A>
<TABLE BORDER="0" WIDTH="100%" CELLPADDING="1" CELLSPACING="0" SUMMARY="">
<TR>
<TD COLSPAN=2 BGCOLOR="#EEEEFF" CLASS="NavBarCell1">
<A NAME="navbar_bottom_firstrow"><!-- --></A>
<TABLE BORDER="0" CELLPADDING="0" CELLSPACING="3" SUMMARY="">
  <TR ALIGN="center" VALIGN="top">
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../overview-summary.html"><FONT CLASS="NavBarFont1"><B>Overview</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#FFFFFF" CLASS="NavBarCell1Rev"> &nbsp;<FONT CLASS="NavBarFont1Rev"><B>Package</B></FONT>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <FONT CLASS="NavBarFont1">Class</FONT>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="package-use.html"><FONT CLASS="NavBarFont1"><B>Use</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="package-tree.html"><FONT CLASS="NavBarFont1"><B>Tree</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../deprecated-list.html"><FONT CLASS="NavBarFont1"><B>Deprecated</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../index-all.html"><FONT CLASS="NavBarFont1"><B>Index</B></FONT></A>&nbsp;</TD>
  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../help-doc.html"><FONT CLASS="NavBarFont1"><B>Help</B></FONT></A>&nbsp;</TD>
  </TR>
</TABLE>
</TD>
<TD ALIGN="right" VALIGN="top" ROWSPAN=3><EM>
</EM>
</TD>
</TR>

<TR>
<TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2">
&nbsp;<A HREF="../../../../org/apache/hadoop/net/package-summary.html"><B>PREV PACKAGE</B></A>&nbsp;
&nbsp;<A HREF="../../../../org/apache/hadoop/record/compiler/package-summary.html"><B>NEXT PACKAGE</B></A></FONT></TD>
<TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2">
  <A HREF="../../../../index.html?org/apache/hadoop/record/package-summary.html" target="_top"><B>FRAMES</B></A>  &nbsp;
&nbsp;<A HREF="package-summary.html" target="_top"><B>NO FRAMES</B></A>  &nbsp;
&nbsp;<SCRIPT type="text/javascript">
  <!--
  if(window==top) {
    document.writeln('<A HREF="../../../../allclasses-noframe.html"><B>All Classes</B></A>');
  }
  //-->
</SCRIPT>
<NOSCRIPT>
  <A HREF="../../../../allclasses-noframe.html"><B>All Classes</B></A>
</NOSCRIPT>


</FONT></TD>
</TR>
</TABLE>
<A NAME="skip-navbar_bottom"></A>
<!-- ======== END OF BOTTOM NAVBAR ======= -->

<HR>
Copyright &copy; 2009 The Apache Software Foundation
</BODY>
</HTML>