All pastes #2110932 Raw Edit

asdasd

public java v1 · immutable
#2110932 ·published 2012-02-07 20:10 UTC
rendered paste body
package scraper;import java.io.BufferedReader;import java.io.FileNotFoundException;import java.io.FileReader;import java.io.IOException;import java.io.InputStreamReader;import org.htmlcleaner.CleanerProperties;import org.htmlcleaner.HtmlCleaner;import org.htmlcleaner.PrettyXmlSerializer;import org.htmlcleaner.SimpleHtmlSerializer;import org.htmlcleaner.TagNode;public class Main {	public static String clean(String filename) throws IOException{		FileReader input = new FileReader(filename);		BufferedReader bufRead = new BufferedReader(input);		String websiteSource = "";                System.out.println("Reading input file,this may take a while...");		String line = bufRead.readLine();		while( line != null){			websiteSource += line+"\r\n";			line = bufRead.readLine();		}		HtmlCleaner cleaner = new HtmlCleaner();		CleanerProperties props = cleaner.getProperties();		props.setOmitComments(true);		props.setUseEmptyElementTags(false);                System.out.println("Cleaning the file...");		TagNode node = cleaner.clean(websiteSource);                System.out.println("Saving the file...");                new PrettyXmlSerializer(props).writeToFile(node, filename+".clean.xml", "utf-8");		bufRead.close();		input.close();		return filename+".clean.xml";	}	        ///Users/philippenadeau/Desktop/freq.html	public static void main(String[] args){		try {                    if(args.length == 0){                        System.out.println("Usage: java -jar scraper.jar <HTML absolute path>");                        System.exit(1);                    }                    //String file = args[0];                    String cleanedFile = clean(args[0]);		}catch (FileNotFoundException e) {                    e.printStackTrace();                    System.out.println("Unable to find the file.");                    System.exit(1);		} catch (IOException e) {                    e.printStackTrace();                    System.out.println("Problem reading/writing the file.");                    System.exit(1);		}	}	}