Tutorial Join

25
Hpot-Tech 1 Joins Create a java project JoinMap and create the following classes:

description

Joins

Transcript of Tutorial Join

  • Hpot-Tech

    1 Joins

    Create a java project JoinMap and create the following classes:

  • Hpot-Tech

    2 Joins

    package com.hp.join; // == JobBuilder

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;
    import org.apache.hadoop.util.Tool;

    /**
     * Helper for wiring up MapReduce jobs from command-line arguments:
     * parses generic Hadoop options, an optional {@code -overwrite} flag,
     * the input/output paths, and any tool-specific extra arguments.
     */
    public class JobBuilder {

      private final Class<?> driverClass;  // driver class, used for setJarByClass
      private final Job job;
      private final int extraArgCount;     // how many extra args (beyond input/output) are allowed
      private final String extrArgsUsage;  // usage text describing those extra args

      private String[] extraArgs;          // extra args captured by withCommandLineArgs()

      public JobBuilder(Class<?> driverClass) throws IOException {
        this(driverClass, 0, "");
      }

      public JobBuilder(Class<?> driverClass, int extraArgCount, String extrArgsUsage)
          throws IOException {
        this.driverClass = driverClass;
        this.extraArgCount = extraArgCount;
        this.job = new Job();
        this.job.setJarByClass(driverClass);
        this.extrArgsUsage = extrArgsUsage;
      }

      // vv JobBuilder
      /**
       * Parses the standard {@code <input> <output>} argument pair.
       *
       * @return a Job configured with the two paths, or {@code null} (after
       *         printing usage) when the wrong number of arguments is given
       */
      public static Job parseInputAndOutput(Tool tool, Configuration conf, String[] args)
          throws IOException {
        if (args.length != 2) {
          printUsage(tool, " ");
          return null;
        }
        Job job = new Job(conf);
        job.setJarByClass(tool.getClass());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job;
      }

      public static void printUsage(Tool tool, String extraArgsUsage) {
        System.err.printf("Usage: %s [genericOptions] %s\n\n",
            tool.getClass().getSimpleName(), extraArgsUsage);
        GenericOptionsParser.printGenericCommandUsage(System.err);
      }
      // ^^ JobBuilder

      public JobBuilder withCommandLineArgs(String... args) throws IOException {
        Configuration conf = job.getConfiguration();
        GenericOptionsParser parser = new GenericOptionsParser(conf, args);
        String[] otherArgs = parser.getRemainingArgs();
        // BUG FIX: the original used '&&', which can never be true (a length
        // cannot be both < 2 and > 3 + extraArgCount), so bad argument counts
        // slipped through. '||' rejects both too-few and too-many args.
        if (otherArgs.length < 2 || otherArgs.length > 3 + extraArgCount) {
          System.err.printf("Usage: %s [genericOptions] [-overwrite] %s\n\n",
              driverClass.getSimpleName(), extrArgsUsage);
          GenericOptionsParser.printGenericCommandUsage(System.err);
          System.exit(-1);
        }
        int index = 0;
        boolean overwrite = false;
        if (otherArgs[index].equals("-overwrite")) {
          overwrite = true;
          index++;
        }
        Path input = new Path(otherArgs[index++]);
        Path output = new Path(otherArgs[index++]);

        // Anything left over after input/output is a tool-specific extra arg.
        if (index < otherArgs.length) {
          extraArgs = new String[otherArgs.length - index];
          System.arraycopy(otherArgs, index, extraArgs, 0, otherArgs.length - index);
        }

        if (overwrite) {
          // Delete a pre-existing output directory so the job can rerun.
          output.getFileSystem(conf).delete(output, true);
        }

        FileInputFormat.addInputPath(job, input);
        FileOutputFormat.setOutputPath(job, output);
        return this;
      }

      public Job build() {
        return job;
      }

      public String[] getExtraArgs() {
        return extraArgs;
      }
    }

  • Hpot-Tech

    5 Joins

    package com.hp.join;

    import java.io.IOException;

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.MapReduceBase;
    import org.apache.hadoop.mapred.Mapper;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reporter;

    /**
     * Mapper for the NCDC weather-record side of the join. Emits each record
     * keyed by (stationId, "1"); the "1" tag makes weather records sort AFTER
     * the station-name record (tagged "0") within each station group, which
     * JoinReducer relies on.
     *
     * Generic type parameters restored: the extraction had stripped them,
     * leaving raw types that fail to implement Mapper's map() method.
     */
    public class JoinRecordMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, TextPair, Text> {

      private NcdcRecordParser parser = new NcdcRecordParser();

      @Override
      public void map(LongWritable key, Text value,
          OutputCollector<TextPair, Text> output, Reporter reporter)
          throws IOException {
        parser.parse(value);
        output.collect(new TextPair(parser.getStationId(), "1"), value);
      }
    }

  • Hpot-Tech

    6 Joins

    package com.hp.join;

    import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.*; import org.apache.hadoop.mapred.lib.MultipleInputs; import org.apache.hadoop.util.*; @SuppressWarnings("deprecation") public class JoinRecordWithStationName extends Configured implements Tool {

    public static class KeyPartitioner implements Partitioner { @Override public void configure(JobConf job) {}

    @Override public int getPartition(TextPair key, Text value, int numPartitions) { return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions; } }

    @Override public int run(String[] args) throws Exception { if (args.length != 3) { JobBuilder.printUsage(this, " "); return -1; }

    JobConf conf = new JobConf(getConf(), getClass()); conf.setJobName("Join record with station name");

    Path ncdcInputPath = new Path(args[0]); Path stationInputPath = new Path(args[1]); Path outputPath = new Path(args[2]);

    MultipleInputs.addInputPath(conf, ncdcInputPath, TextInputFormat.class, JoinRecordMapper.class); MultipleInputs.addInputPath(conf, stationInputPath, TextInputFormat.class, JoinStationMapper.class);

  • Hpot-Tech

    7 Joins

    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setPartitionerClass(KeyPartitioner.class); conf.setOutputValueGroupingComparator(TextPair.FirstComparator.class);

    conf.setMapOutputKeyClass(TextPair.class);

    conf.setReducerClass(JoinReducer.class);

    conf.setOutputKeyClass(Text.class);

    JobClient.runJob(conf); return 0; }

    public static void main(String[] args) throws Exception { args = new String[3]; args[0] = "inputncdc"; args[1] = "inputstation"; args[2] = "output"+System.currentTimeMillis();

    int exitCode = ToolRunner.run(new JoinRecordWithStationName(), args); System.exit(exitCode); } }

  • Hpot-Tech

    8 Joins

    package com.hp.join;

    import java.io.IOException;
    import java.util.Iterator;

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.*;

    /**
     * Joins the station name onto each weather record for a station. Relies on
     * the secondary sort: the station-name value (tagged "0" by
     * JoinStationMapper) sorts before the weather records (tagged "1"), so the
     * first value in each group is the station name.
     *
     * Generic type parameters restored: with the raw Iterator the original
     * {@code new Text(values.next())} did not compile (no Text(Object) ctor).
     */
    public class JoinReducer extends MapReduceBase
        implements Reducer<TextPair, Text, Text, Text> {

      @Override
      public void reduce(TextPair key, Iterator<Text> values,
          OutputCollector<Text, Text> output, Reporter reporter)
          throws IOException {
        // Copy the first value: Hadoop reuses the Text instance on each next().
        // NOTE(review): this assumes every station id has a stations-file entry;
        // with no name record the first weather record is misused as the name.
        Text stationName = new Text(values.next());
        while (values.hasNext()) {
          Text record = values.next();
          Text outValue = new Text(stationName.toString() + "\t" + record.toString());
          output.collect(key.getFirst(), outValue);
        }
      }
    }

  • Hpot-Tech

    9 Joins

    package com.hp.join;

    import java.io.IOException;

    import org.apache.hadoop.io.*;
    import org.apache.hadoop.mapred.*;

    /**
     * Mapper for the station-metadata side of the join. Emits each valid
     * station keyed by (stationId, "0"); the "0" tag makes the station name
     * sort BEFORE the weather records (tagged "1") in each reduce group.
     *
     * Generic type parameters restored (stripped by extraction).
     */
    public class JoinStationMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, TextPair, Text> {

      private NcdcStationMetadataParser parser = new NcdcStationMetadataParser();

      @Override
      public void map(LongWritable key, Text value,
          OutputCollector<TextPair, Text> output, Reporter reporter)
          throws IOException {
        // parse() returns false for header/invalid lines, which are dropped.
        if (parser.parse(value)) {
          output.collect(new TextPair(parser.getStationId(), "0"),
              new Text(parser.getStationName()));
        }
      }
    }

  • Hpot-Tech

    10 Joins

    package com.hp.join;

    import java.math.*;

    import org.apache.hadoop.io.Text;

    /**
     * Parser for UK Met Office fixed-width records: extracts the year
     * (columns 3-7) and the air temperature (columns 13-18, in degrees,
     * stored scaled by 10 as an int).
     */
    public class MetOfficeRecordParser {

      private String year;
      private String airTemperatureString;
      private int airTemperature;          // temperature * 10
      private boolean airTemperatureValid; // true only when a numeric temperature was parsed

      public void parse(String record) {
        // BUG FIX: reset per-record state first. Parsers are reused record by
        // record in mappers; the original never cleared these fields, so after
        // one valid record every short/invalid record still reported the stale
        // temperature as valid.
        year = null;
        airTemperatureString = null;
        airTemperature = 0;
        airTemperatureValid = false;

        if (record.length() < 18) {
          return; // too short to contain the fixed-width fields
        }
        year = record.substring(3, 7);
        if (isValidRecord(year)) {
          airTemperatureString = record.substring(13, 18);
          // "---" marks a missing reading.
          if (!airTemperatureString.trim().equals("---")) {
            BigDecimal temp = new BigDecimal(airTemperatureString.trim());
            // Scale by 10 so e.g. 12.3 degrees is stored exactly as int 123.
            temp = temp.multiply(new BigDecimal(BigInteger.TEN));
            airTemperature = temp.intValueExact();
            airTemperatureValid = true;
          }
        }
      }

      /** A record is valid when its year field is numeric (header lines are not). */
      private boolean isValidRecord(String year) {
        try {
          Integer.parseInt(year);
          return true;
        } catch (NumberFormatException e) {
          return false;
        }
      }

      public void parse(Text record) {
        parse(record.toString());
      }

      public String getYear() {
        return year;
      }

      /** Returns the temperature scaled by 10; meaningful only when isValidTemperature(). */
      public int getAirTemperature() {
        return airTemperature;
      }

      public String getAirTemperatureString() {
        return airTemperatureString;
      }

      public boolean isValidTemperature() {
        return airTemperatureValid;
      }

    }

  • Hpot-Tech

    12 Joins

    package com.hp.join;

    import java.text.*;
    import java.util.Date;

    import org.apache.hadoop.io.Text;

    /**
     * Parser for NCDC fixed-width weather records: station id, observation
     * date, year, air temperature (tenths of a degree) and quality code.
     */
    public class NcdcRecordParser {

      private static final int MISSING_TEMPERATURE = 9999;

      // NOTE(review): SimpleDateFormat is not thread-safe; sharing this static
      // instance is only safe because each map/reduce task uses its own parser
      // in a single thread — confirm before reusing this class elsewhere.
      private static final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyyMMddHHmm");

      private String stationId;
      private String observationDateString;
      private String year;
      private String airTemperatureString;
      private int airTemperature;
      private boolean airTemperatureMalformed;
      private String quality;

      public void parse(String record) {
        stationId = record.substring(4, 10) + "-" + record.substring(10, 15);
        observationDateString = record.substring(15, 27);
        year = record.substring(15, 19);
        airTemperatureMalformed = false;
        // Remove leading plus sign as parseInt doesn't like them
        if (record.charAt(87) == '+') {
          airTemperatureString = record.substring(88, 92);
          airTemperature = Integer.parseInt(airTemperatureString);
        } else if (record.charAt(87) == '-') {
          airTemperatureString = record.substring(87, 92);
          airTemperature = Integer.parseInt(airTemperatureString);
        } else {
          airTemperatureMalformed = true;
        }
        // BUG FIX: the original re-ran Integer.parseInt(airTemperatureString)
        // unconditionally here. For a malformed record that string is null (or
        // stale from the previous record), so parsing threw a NullPointerException
        // or silently kept the wrong value; both valid branches had already
        // parsed it, making the line redundant anyway.
        quality = record.substring(92, 93);
      }

      public void parse(Text record) {
        parse(record.toString());
      }

      public boolean isValidTemperature() {
        return !airTemperatureMalformed && airTemperature != MISSING_TEMPERATURE
            && quality.matches("[01459]");
      }

      public boolean isMalformedTemperature() {
        return airTemperatureMalformed;
      }

      public boolean isMissingTemperature() {
        return airTemperature == MISSING_TEMPERATURE;
      }

      public String getStationId() {
        return stationId;
      }

      /** @throws IllegalArgumentException if the date field is unparseable */
      public Date getObservationDate() {
        try {
          // Removed a leftover debug System.out.println of the date string.
          return DATE_FORMAT.parse(observationDateString);
        } catch (ParseException e) {
          throw new IllegalArgumentException(e);
        }
      }

      public String getYear() {
        return year;
      }

      public int getYearInt() {
        return Integer.parseInt(year);
      }

      /** Returns the temperature in tenths of a degree; check isValidTemperature() first. */
      public int getAirTemperature() {
        return airTemperature;
      }

      public String getAirTemperatureString() {
        return airTemperatureString;
      }

      public String getQuality() {
        return quality;
      }

    }

  • Hpot-Tech

    15 Joins

    package com.hp.join;

    import java.io.*;
    import java.util.*;

    import org.apache.hadoop.io.IOUtils;

    /**
     * In-memory map of NCDC station id to station name, loaded from a
     * stations metadata file.
     *
     * Generics restored (stripped by extraction): with a raw Map, get()
     * returns Object and the String assignment below does not compile.
     */
    public class NcdcStationMetadata {

      private Map<String, String> stationIdToName = new HashMap<String, String>();

      /** Loads the station file, keeping only lines the parser accepts. */
      public void initialize(File file) throws IOException {
        BufferedReader in = null;
        try {
          in = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
          NcdcStationMetadataParser parser = new NcdcStationMetadataParser();
          String line;
          while ((line = in.readLine()) != null) {
            if (parser.parse(line)) {
              stationIdToName.put(parser.getStationId(), parser.getStationName());
            }
          }
        } finally {
          IOUtils.closeStream(in);
        }
      }

      public String getStationName(String stationId) {
        String stationName = stationIdToName.get(stationId);
        if (stationName == null || stationName.trim().length() == 0) {
          return stationId; // no match: fall back to ID
        }
        return stationName;
      }

      /** Read-only view of the loaded id-to-name map. */
      public Map<String, String> getStationIdToNameMap() {
        return Collections.unmodifiableMap(stationIdToName);
      }

    }

  • Hpot-Tech

    16 Joins

    package com.hp.join; import org.apache.hadoop.io.Text;

    public class NcdcStationMetadataParser {

    private String stationId; private String stationName;

    public boolean parse(String record) { if (record.length() < 42) { // header return false; } String usaf = record.substring(0, 6); String wban = record.substring(7, 12); stationId = usaf + "-" + wban; stationName = record.substring(13, 42); try { Integer.parseInt(usaf); // USAF identifiers are numeric return true; } catch (NumberFormatException e) { return false; } }

    public boolean parse(Text record) { return parse(record.toString()); }

    public String getStationId() { return stationId; }

    public String getStationName() { return stationName; }

    }

  • Hpot-Tech

    17 Joins

    package com.hp.join;
    // cc TextPair A Writable implementation that stores a pair of Text objects
    // cc TextPairComparator A RawComparator for comparing TextPair byte representations
    // cc TextPairFirstComparator A custom RawComparator for comparing the first field
    //    of TextPair byte representations
    // vv TextPair
    import java.io.*;

    import org.apache.hadoop.io.*;

    /**
     * A Writable pair of Text objects, used here as (stationId, tag) so that
     * grouping can use the first field while sorting uses both.
     *
     * Generic parameter on WritableComparable restored (stripped by
     * extraction); it is required for the compareTo(TextPair) override below.
     */
    public class TextPair implements WritableComparable<TextPair> {

      private Text first;
      private Text second;

      public TextPair() {
        set(new Text(), new Text());
      }

      public TextPair(String first, String second) {
        set(new Text(first), new Text(second));
      }

      public TextPair(Text first, Text second) {
        set(first, second);
      }

      public void set(Text first, Text second) {
        this.first = first;
        this.second = second;
      }

      public Text getFirst() {
        return first;
      }

      public Text getSecond() {
        return second;
      }

      @Override
      public void write(DataOutput out) throws IOException {
        first.write(out);
        second.write(out);
      }

      @Override
      public void readFields(DataInput in) throws IOException {
        first.readFields(in);
        second.readFields(in);
      }

      @Override
      public int hashCode() {
        return first.hashCode() * 163 + second.hashCode();
      }

      @Override
      public boolean equals(Object o) {
        if (o instanceof TextPair) {
          TextPair tp = (TextPair) o;
          return first.equals(tp.first) && second.equals(tp.second);
        }
        return false;
      }

      @Override
      public String toString() {
        return first + "\t" + second;
      }

      @Override
      public int compareTo(TextPair tp) {
        int cmp = first.compareTo(tp.first);
        if (cmp != 0) {
          return cmp;
        }
        return second.compareTo(tp.second);
      }
      // ^^ TextPair

      // vv TextPairComparator
      /** Raw comparator: compares serialized TextPairs without deserializing. */
      public static class Comparator extends WritableComparator {

        private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();

        public Comparator() {
          super(TextPair.class);
        }

        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
          try {
            // Each serialized Text is a VInt length followed by the bytes;
            // firstL* is the total byte length of the first Text field.
            int firstL1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
            int firstL2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
            int cmp = TEXT_COMPARATOR.compare(b1, s1, firstL1, b2, s2, firstL2);
            if (cmp != 0) {
              return cmp;
            }
            // First fields equal: compare the remaining (second) fields.
            return TEXT_COMPARATOR.compare(b1, s1 + firstL1, l1 - firstL1,
                                           b2, s2 + firstL2, l2 - firstL2);
          } catch (IOException e) {
            throw new IllegalArgumentException(e);
          }
        }
      }

      static {
        // Register the raw comparator as the default for TextPair.
        WritableComparator.define(TextPair.class, new Comparator());
      }
      // ^^ TextPairComparator

      // vv TextPairFirstComparator
      /** Raw comparator over the FIRST field only; used for value grouping. */
      public static class FirstComparator extends WritableComparator {

        private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();

        public FirstComparator() {
          super(TextPair.class);
        }

        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
          try {
            int firstL1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
            int firstL2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
            return TEXT_COMPARATOR.compare(b1, s1, firstL1, b2, s2, firstL2);
          } catch (IOException e) {
            throw new IllegalArgumentException(e);
          }
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
          if (a instanceof TextPair && b instanceof TextPair) {
            return ((TextPair) a).first.compareTo(((TextPair) b).first);
          }
          return super.compare(a, b);
        }
      }
      // ^^ TextPairFirstComparator

    // vv TextPair
    }
    // ^^ TextPair

  • Hpot-Tech

    21 Joins

    Create the following folder and copy the file:

  • Hpot-Tech

    22 Joins

  • Hpot-Tech

    23 Joins

    Run the application:

  • Hpot-Tech

    24 Joins

    Submit the jar in cluster:

    Export the jar and submit as follows:

    Create the necessary input folders:

    Comment out the path initialization as follows, so the paths passed on the command line are used:

    /*args = new String[3]; args[0] = "inputncdc"; args[1] = "inputstation"; args[2] = "output"+System.currentTimeMillis();*/

    #hadoop fs -mkdir incdc/

    #hadoop fs -mkdir instation/

    #hadoop fs -copyFromLocal /hadoop/data/sample.txt incdc/

    #hadoop fs -copyFromLocal /hadoop/data/stations*.txt instation/

    #hadoop jar /hadoop/hadoop/myhadoopjoin.jar com.hp.join.JoinRecordWithStationName incdc instation outputs

  • Hpot-Tech

    25 Joins

    You can view the data as follows: