Miscellany

public text v1 · immutable
#2120055
·published 2012-02-19 23:46 UTC
package org.commoncrawl.tutorial;

import static java.lang.System.err;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Scanner;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;

import org.commoncrawl.protocol.shared.ArcFileItem;

import org.jsoup.Jsoup;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Outputs all words contained within the displayed text of pages contained
 * within {@code ArcFileItem} objects.
 * 
 * @author Steve Salevan <steve.salevan@gmail.com>
 */
public class WordCountMapper extends MapReduceBase 
  implements Mapper<Text, ArcFileItem, Text, LongWritable> {

  public void map(Text key, ArcFileItem value,
      OutputCollector<Text, LongWritable> output, Reporter reporter)
      throws IOException {
    try {
    	if (!value.getMimeType().contains("pdf")) {
            return;  // Only parse text.
          }
    	ByteArrayInputStream inputStream = new ByteArrayInputStream(
    	          value.getContent().getReadOnlyBytes(), 0,
    	          value.getContent().getCount());
    	    
    	      // Converts InputStream to a String.
    	      //String content = new Scanner(inputStream).useDelimiter("\\A").next();
    	      String pageText = new String();
    	      try {
    	          pageText = parse(inputStream);
    	      } catch (Throwable t) {
    	          err.println("Could not parse document:" + t.getClass() + ":" + t.getMessage());
    	          t.printStackTrace(err);
    	      }

    	      // Parses HTML with a tolerant parser and extracts all text.
    	      //String pageText = Jsoup.parse(content).text();
    	      // Removes all punctuation.
    	      //pageText = pageText.replaceAll("[^a-zA-Z0-9 ]", "");
    	      // Normalizes whitespace to single spaces.
    	      //pageText = pageText.replaceAll("\\s+", " ");
    	      // Splits by space and outputs to OutputCollector.
    	      /*for (String word: pageText.split(" ")) {
    	        output.collect(new Text(word), new LongWritable(1));
    	      }*/
    	      output.collect(new Text(pageText), new LongWritable(1));
      /*if (!value.getMimeType().contains("text")) {
        return;  // Only parse text.
      }
      // Retrieves page content from the passed-in ArcFileItem.
      /*ByteArrayInputStream inputStream = new ByteArrayInputStream(
          value.getContent().getReadOnlyBytes(), 0,
          value.getContent().getCount());
      // Converts InputStream to a String.
      String content = new Scanner(inputStream).useDelimiter("\\A").next();
      // Parses HTML with a tolerant parser and extracts all text.
      String pageText = Jsoup.parse(content).text();
      // Removes all punctuation.
      pageText = pageText.replaceAll("[^a-zA-Z0-9 ]", "");
      // Normalizes whitespace to single spaces.
      pageText = pageText.replaceAll("\\s+", " ");
      // Splits by space and outputs to OutputCollector.
      for (String word: pageText.split(" ")) {
        output.collect(new Text(word), new LongWritable(1));
      }*/
      //output.collect(new Text(value.getMimeType()), new LongWritable(1));
    }
    catch (Exception e) {
      reporter.getCounter("WordCountMapper.exception",
          e.getClass().getSimpleName()).increment(1);
    }
  }
  private static String parse(ByteArrayInputStream input1) throws IOException,
  SAXException, TikaException {

	//<start id="main"/>
	InputStream input = input1;
	ContentHandler textHandler = new BodyContentHandler();
	Metadata metadata = new Metadata();
	PDFParser parser = new PDFParser();
	parser.parse(input, textHandler, metadata);
	input.close();
	return textHandler.toString();
  }
}