rendered paste bodypackage scraper;import java.io.BufferedReader;import java.io.FileNotFoundException;import java.io.FileReader;import java.io.IOException;import java.io.InputStreamReader;import org.htmlcleaner.CleanerProperties;import org.htmlcleaner.HtmlCleaner;import org.htmlcleaner.PrettyXmlSerializer;import org.htmlcleaner.SimpleHtmlSerializer;import org.htmlcleaner.TagNode;public class Main { public static String clean(String filename) throws IOException{ FileReader input = new FileReader(filename); BufferedReader bufRead = new BufferedReader(input); String websiteSource = ""; System.out.println("Reading input file,this may take a while..."); String line = bufRead.readLine(); while( line != null){ websiteSource += line+"\r\n"; line = bufRead.readLine(); } HtmlCleaner cleaner = new HtmlCleaner(); CleanerProperties props = cleaner.getProperties(); props.setOmitComments(true); props.setUseEmptyElementTags(false); System.out.println("Cleaning the file..."); TagNode node = cleaner.clean(websiteSource); System.out.println("Saving the file..."); new PrettyXmlSerializer(props).writeToFile(node, filename+".clean.xml", "utf-8"); bufRead.close(); input.close(); return filename+".clean.xml"; } ///Users/philippenadeau/Desktop/freq.html public static void main(String[] args){ try { if(args.length == 0){ System.out.println("Usage: java -jar scraper.jar <HTML absolute path>"); System.exit(1); } //String file = args[0]; String cleanedFile = clean(args[0]); }catch (FileNotFoundException e) { e.printStackTrace(); System.out.println("Unable to find the file."); System.exit(1); } catch (IOException e) { e.printStackTrace(); System.out.println("Problem reading/writing the file."); System.exit(1); } } }