rendered paste bodypackage scraper;import java.io.BufferedReader;import java.io.File;import java.io.FileNotFoundException;import java.io.FileReader;import java.io.IOException;import java.util.ArrayList;import java.util.logging.Level;import java.util.logging.Logger;import javax.xml.parsers.DocumentBuilder;import javax.xml.parsers.DocumentBuilderFactory;import javax.xml.parsers.ParserConfigurationException;import javax.xml.xpath.*;import org.htmlcleaner.*;import org.w3c.dom.Document;import org.w3c.dom.NamedNodeMap;import org.w3c.dom.Node;import org.w3c.dom.NodeList;import org.xml.sax.SAXException;/*<tr> <td width="53" align="center" background="images/bleu_p.gif"> <img border="0" src="images/feux_vert.gif" alt="En fonction" /> </td> <td width="103" align="left" valign="top" background="images/bleu_p.gif"> <font face="Arial" size="2"> <b>000-12</b> <br /> <a class="menu" href="recherche_rep_trunk.asp?t=Motorola Type I - A�ROPORT DE DORVAL">Motorola</a> </font> </td> <td width="458" align="left" valign="top" background="images/bleu_p.gif"> <font face="Arial" size="2"> <b>Commercial</b> - A�ROPORT DE DORVAL <br /> DORVAL - Qu�bec - Montr�al </font> </td> <td width="131" align="center" background="images/bleu_p.gif"> <font face="Arial" size="2"> <a href="info_id.asp?id=5027" target="_blank"> <img border="0" src="images/info.gif" alt="Voir les informations suppl�mentaires pour cette entr�e" /> </a> <a target="_blank" href="mod_id.asp?id=5027"> <img border="0" src="images/modif2.gif" alt="Modifier cette entr�e" /> </a> <a target="_blank" href="info_id.asp?id=5027#comm"> <img border="0" src="images/commentaire.gif" width="14" height="14" alt="Commentaire disponible" /> </a> </font> </td></tr>*/public class Scraper { /* stateVec contains Integers : * 0=Working; * 1=Not sure; * 2=Not working; * */ public static ArrayList stateVec = new ArrayList(); /*freqIdVec contains Strings describing the frequence or the ID * If typeVec == 0, freqIdVec is a frequency. If typeVec != 0, freqIdVec is an ID. */ public static ArrayList freqIdVec = new ArrayList(); /* typeVec contains Integers : -2=Not trunked, data+voice -1=Not trunked, data 0=Not trunked, voice Ericsson GE 1=EDACS Provoice 2=EDACS 3=GE Mark V Logic Trunked Radio 4=LTR Standard 5=LTR Passport 6=LTR Standard and Passport 7=LTR MultiNet 8=LTR-Net Motorola 9=Type I 10=Type II 11=Type IIi Hybrid 12=Type II SmartZone 13=Type II SmartZone OmniLink 14=iDEN (integrated Digital Enhanced Network) 15=Motorola Harmony (see iDEN) 16=MPT-1327 17=OpenSky System 18=APCO Project 16 19=APCO Project 25 20=SmarTrunk 21=TETRA 22=TETRAPOL 23=Kenwood NEXEDGE Digital trunked radio 24=Icom IDAS Digital trunked Land Mobile Radio 25=NXDN protocol * */ public static ArrayList typeVec = new ArrayList(); //serviceVec contains Strings describing the service (duh!) public static ArrayList serviceVec = new ArrayList(); //cityVec contains Strings describing the city. public static ArrayList cityVec = new ArrayList(); public static String clean(String filename) throws IOException{ System.out.println("Beginning cleaning operation!"); FileReader input = new FileReader(filename); BufferedReader bufRead = new BufferedReader(input); String websiteSource = ""; System.out.println("+ Reading input file,this may take a while..."); String line = bufRead.readLine(); while( line != null){ websiteSource += line+"\r\n"; line = bufRead.readLine(); } HtmlCleaner cleaner = new HtmlCleaner(); CleanerProperties props = cleaner.getProperties(); props.setOmitComments(true); props.setUseEmptyElementTags(false); System.out.println("+ Cleaning the file..."); TagNode node = cleaner.clean(websiteSource); System.out.println("+ Saving the file "+filename+".clean.xml"); new PrettyXmlSerializer(props).writeToFile(node, filename+".clean.xml", "utf-8"); bufRead.close(); input.close(); return filename+".clean.xml"; } public static void parse(String cleanedFile) throws FileNotFoundException, IOException , ParserConfigurationException, SAXException, XPathExpressionException { System.out.println("Beginning parsing operation!"); DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(true); DocumentBuilder builder; Document doc = null; XPathExpression expr = null; builder = factory.newDocumentBuilder(); System.out.println("+ Reading cleaned file,this may take a while..."); doc = (Document) builder.parse(new File(cleanedFile)); XPathFactory xFactory = XPathFactory.newInstance(); XPath xpath = xFactory.newXPath(); //state expr = xpath.compile("//td[@width='53']/img | //td[@width='53']/p/img"); Object result = expr.evaluate(doc, XPathConstants.NODESET); NodeList nodes = (NodeList) result; System.out.println("+ State: Scanning "+nodes.getLength()+" nodes"); for (int i=0; i<nodes.getLength();i++){ NamedNodeMap attributes = nodes.item(i).getAttributes(); Node node = attributes.getNamedItem("alt"); String str = node.getNodeValue().replaceAll("[\\W&&[^\\x20\\x2D]]", "").trim(); if(str.equals("En fonction")){ //its working System.out.println(i+": WORKS"); }else if(str.equals("valider si encore en opration")){ //maybe working System.out.println(i+": MAYBE works"); }else if(str.equals("Plus en opration")){ //not working System.out.println(i+": DONT works"); } } //Freq expr = xpath.compile("//td[@width='103']/font//text() | //td[@width='103']/font/b/text()"); result = expr.evaluate(doc, XPathConstants.NODESET); nodes = (NodeList) result; System.out.println("+ Freq/ID: Scanning "+nodes.getLength()+" nodes"); for (int i=0; i<nodes.getLength();i++){ //Escape everything that is not a A-z , 0-9, an hyphen or a whitespace String str = nodes.item(i).getNodeValue().replaceAll("[^\\d\\x2E\\x2D]", ""); if(str.length() > 1) System.out.println(i+": "+str); } /* //service expr = xpath.compile("//font/b/text()"); result = expr.evaluate(doc, XPathConstants.NODESET); nodes = (NodeList) result; System.out.println("+ Services: Scanning "+nodes.getLength()+" nodes"); for (int i=1; i<nodes.getLength();i+=2){ //Escape everything that is not a A-z , 0-9, an hyphen or a whitespace System.out.println(nodes.item(i).getNodeValue().replaceAll("[\\W&&[^\\x20\\x2D]]", "")); } //Trunking type if trunked expr = xpath.compile("//font/a/@href"); result = expr.evaluate(doc, XPathConstants.NODESET); nodes = (NodeList) result; System.out.println("+ Services: Scanning "+nodes.getLength()+" nodes"); for (int i=1; i<nodes.getLength();i+=2){ //Escape everything that is not a A-z , 0-9, an hyphen or a whitespace String str = nodes.item(i).getNodeValue().replaceAll("[\\W&&[^\\x20\\x2D]]", ""); if(str.length() > 1 && str.contains("trunk")){ str = str.replace("recherche_rep_trunkaspt", ""); System.out.println(str.substring(0, str.indexOf("-")-1)); } }*/ /*// New XPath expression to get the number of people with name lars expr = xpath.compile("count(//person[firstname='Lars'])"); // Run the query and get the number of nodes Double number = (Double) expr.evaluate(doc, XPathConstants.NUMBER); System.out.println("Number of objects " +number); // Do we have more then 2 people with name lars? expr = xpath.compile("count(//person[firstname='Lars']) >2"); // Run the query and get the number of nodes Boolean check = (Boolean) expr.evaluate(doc, XPathConstants.BOOLEAN); System.out.println(check);*/ } ///Users/philippenadeau/Desktop/freq.html public static void main(String[] args){ try { if(args.length == 0){ System.out.println("Usage: java -jar scraper.jar <HTML file absolute path>"); System.exit(1); } String cleanedFile = clean(args[0]); parse(cleanedFile); } catch (ParserConfigurationException ex) { System.out.println("Wrong parsing configuration."); System.exit(1); } catch (SAXException ex) { System.out.println("SAX problem."); System.exit(1); } catch (XPathExpressionException ex) { System.out.println("Wrong XPath."); System.exit(1); }catch (FileNotFoundException e) { System.out.println("Unable to find the file."); System.exit(1); } catch (IOException e) { System.out.println("Problem reading/writing the file."); System.exit(1); } } }