package org.commoncrawl.tutorial;
import static java.lang.System.err;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Scanner;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.commoncrawl.protocol.shared.ArcFileItem;
import org.jsoup.Jsoup;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
* Outputs all words contained within the displayed text of pages contained
* within {@code ArcFileItem} objects.
*
* @author Steve Salevan <steve.salevan@gmail.com>
*/
public class WordCountMapper extends MapReduceBase
implements Mapper<Text, ArcFileItem, Text, LongWritable> {
public void map(Text key, ArcFileItem value,
OutputCollector<Text, LongWritable> output, Reporter reporter)
throws IOException {
try {
if (!value.getMimeType().contains("pdf")) {
return; // Only parse text.
}
ByteArrayInputStream inputStream = new ByteArrayInputStream(
value.getContent().getReadOnlyBytes(), 0,
value.getContent().getCount());
// Converts InputStream to a String.
//String content = new Scanner(inputStream).useDelimiter("\\A").next();
String pageText = new String();
try {
pageText = parse(inputStream);
} catch (Throwable t) {
err.println("Could not parse document:" + t.getClass() + ":" + t.getMessage());
t.printStackTrace(err);
}
// Parses HTML with a tolerant parser and extracts all text.
//String pageText = Jsoup.parse(content).text();
// Removes all punctuation.
//pageText = pageText.replaceAll("[^a-zA-Z0-9 ]", "");
// Normalizes whitespace to single spaces.
//pageText = pageText.replaceAll("\\s+", " ");
// Splits by space and outputs to OutputCollector.
/*for (String word: pageText.split(" ")) {
output.collect(new Text(word), new LongWritable(1));
}*/
output.collect(new Text(pageText), new LongWritable(1));
/*if (!value.getMimeType().contains("text")) {
return; // Only parse text.
}
// Retrieves page content from the passed-in ArcFileItem.
/*ByteArrayInputStream inputStream = new ByteArrayInputStream(
value.getContent().getReadOnlyBytes(), 0,
value.getContent().getCount());
// Converts InputStream to a String.
String content = new Scanner(inputStream).useDelimiter("\\A").next();
// Parses HTML with a tolerant parser and extracts all text.
String pageText = Jsoup.parse(content).text();
// Removes all punctuation.
pageText = pageText.replaceAll("[^a-zA-Z0-9 ]", "");
// Normalizes whitespace to single spaces.
pageText = pageText.replaceAll("\\s+", " ");
// Splits by space and outputs to OutputCollector.
for (String word: pageText.split(" ")) {
output.collect(new Text(word), new LongWritable(1));
}*/
//output.collect(new Text(value.getMimeType()), new LongWritable(1));
}
catch (Exception e) {
reporter.getCounter("WordCountMapper.exception",
e.getClass().getSimpleName()).increment(1);
}
}
private static String parse(ByteArrayInputStream input1) throws IOException,
SAXException, TikaException {
//<start id="main"/>
InputStream input = input1;
ContentHandler textHandler = new BodyContentHandler();
Metadata metadata = new Metadata();
PDFParser parser = new PDFParser();
parser.parse(input, textHandler, metadata);
input.close();
return textHandler.toString();
}
}