All pastes #2113624 Raw Edit

asdasd

public java v1 · immutable
#2113624 ·published 2012-02-09 05:48 UTC
rendered paste body
package scraper;import java.io.BufferedReader;import java.io.File;import java.io.FileNotFoundException;import java.io.FileReader;import java.io.IOException;import java.util.ArrayList;import java.util.logging.Level;import java.util.logging.Logger;import javax.xml.parsers.DocumentBuilder;import javax.xml.parsers.DocumentBuilderFactory;import javax.xml.parsers.ParserConfigurationException;import javax.xml.xpath.*;import org.htmlcleaner.*;import org.w3c.dom.Document;import org.w3c.dom.NamedNodeMap;import org.w3c.dom.Node;import org.w3c.dom.NodeList;import org.xml.sax.SAXException;/*<tr>        <td width="53" align="center" background="images/bleu_p.gif">                <img border="0" src="images/feux_vert.gif" alt="En fonction" />        </td>        <td width="103" align="left" valign="top" background="images/bleu_p.gif">                <font face="Arial" size="2">                        <b>000-12</b>                        <br />                        <a class="menu" href="recherche_rep_trunk.asp?t=Motorola Type I - A�ROPORT DE DORVAL">Motorola</a>                </font>        </td>        <td width="458" align="left" valign="top" background="images/bleu_p.gif">                <font face="Arial" size="2">                        <b>Commercial</b>                         -                        A�ROPORT DE DORVAL                        <br />                        DORVAL -                        Qu�bec - Montr�al                </font>        </td>        <td width="131" align="center" background="images/bleu_p.gif">                <font face="Arial" size="2">                        <a href="info_id.asp?id=5027" target="_blank">                                <img border="0" src="images/info.gif" alt="Voir les informations suppl�mentaires pour cette entr�e" />                        </a>                        <a target="_blank" href="mod_id.asp?id=5027">                                <img border="0" src="images/modif2.gif" alt="Modifier cette entr�e" />                        </a>                        <a target="_blank" href="info_id.asp?id=5027#comm">                                <img border="0" src="images/commentaire.gif" width="14" height="14" alt="Commentaire disponible" />                        </a>                </font>        </td></tr>*/public class Scraper {        /* stateVec contains Integers :          * 0=Working;         * 1=Not sure;         * 2=Not working;         * */        public static ArrayList stateVec     = new ArrayList();        /*freqIdVec contains Strings describing the frequence or the ID         * If typeVec == 0, freqIdVec is a frequency. If typeVec != 0, freqIdVec is an ID.         */        public static ArrayList freqIdVec    = new ArrayList();        /* typeVec contains Integers :            -2=Not trunked, data+voice           -1=Not trunked, data            0=Not trunked, voice           Ericsson GE            1=EDACS Provoice            2=EDACS            3=GE Mark V           Logic Trunked Radio            4=LTR Standard            5=LTR Passport            6=LTR Standard and Passport            7=LTR MultiNet            8=LTR-Net           Motorola            9=Type I            10=Type II            11=Type IIi Hybrid            12=Type II SmartZone            13=Type II SmartZone OmniLink            14=iDEN (integrated Digital Enhanced Network)            15=Motorola Harmony (see iDEN)           16=MPT-1327           17=OpenSky System           18=APCO Project 16           19=APCO Project 25           20=SmarTrunk           21=TETRA           22=TETRAPOL           23=Kenwood NEXEDGE Digital trunked radio           24=Icom IDAS Digital trunked Land Mobile Radio           25=NXDN protocol         * */        public static ArrayList typeVec      = new ArrayList();        //serviceVec contains Strings describing the service (duh!)        public static ArrayList serviceVec   = new ArrayList();        //cityVec contains Strings describing the city.        public static ArrayList cityVec      = new ArrayList();                	public static String clean(String filename) throws IOException{            System.out.println("Beginning cleaning operation!");            FileReader input = new FileReader(filename);            BufferedReader bufRead = new BufferedReader(input);            String websiteSource = "";            System.out.println("+ Reading input file,this may take a while...");            String line = bufRead.readLine();            while( line != null){                    websiteSource += line+"\r\n";                    line = bufRead.readLine();            }            HtmlCleaner cleaner = new HtmlCleaner();            CleanerProperties props = cleaner.getProperties();            props.setOmitComments(true);            props.setUseEmptyElementTags(false);            System.out.println("+ Cleaning the file...");            TagNode node = cleaner.clean(websiteSource);            System.out.println("+ Saving the file "+filename+".clean.xml");            new PrettyXmlSerializer(props).writeToFile(node, filename+".clean.xml", "utf-8");            bufRead.close();            input.close();            return filename+".clean.xml";	}	        public static void parse(String cleanedFile) throws FileNotFoundException, IOException ,            ParserConfigurationException, SAXException, XPathExpressionException {            System.out.println("Beginning parsing operation!");            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();            factory.setNamespaceAware(true);            DocumentBuilder builder;            Document doc = null;            XPathExpression expr = null;            builder = factory.newDocumentBuilder();            System.out.println("+ Reading cleaned file,this may take a while...");            doc =  (Document) builder.parse(new File(cleanedFile));            XPathFactory xFactory = XPathFactory.newInstance();            XPath xpath = xFactory.newXPath();                                    //state            expr = xpath.compile("//td[@width='53']/img | //td[@width='53']/p/img");            Object result = expr.evaluate(doc, XPathConstants.NODESET);            NodeList nodes = (NodeList) result;            System.out.println("+ State: Scanning "+nodes.getLength()+" nodes");            for (int i=0; i<nodes.getLength();i++){                NamedNodeMap attributes = nodes.item(i).getAttributes();                Node node = attributes.getNamedItem("alt");                String str = node.getNodeValue().replaceAll("[\\W&&[^\\x20\\x2D]]", "").trim();                if(str.equals("En fonction")){                    //its working                    System.out.println(i+": WORKS");                }else                if(str.equals("valider si encore en opration")){                    //maybe working                    System.out.println(i+": MAYBE works");                }else                if(str.equals("Plus en opration")){                    //not working                    System.out.println(i+": DONT works");                }            }                        //Freq            expr = xpath.compile("//td[@width='103']/font//text() | //td[@width='103']/font/b/text()");            result = expr.evaluate(doc, XPathConstants.NODESET);            nodes = (NodeList) result;            System.out.println("+ Freq/ID: Scanning "+nodes.getLength()+" nodes");            for (int i=0; i<nodes.getLength();i++){                    //Escape everything that is not a A-z , 0-9, an hyphen or a whitespace                String str = nodes.item(i).getNodeValue().replaceAll("[^\\d\\x2E\\x2D]", "");                if(str.length() > 1)                    System.out.println(i+": "+str);            }                                    /*            //service            expr = xpath.compile("//font/b/text()");            result = expr.evaluate(doc, XPathConstants.NODESET);            nodes = (NodeList) result;            System.out.println("+ Services: Scanning "+nodes.getLength()+" nodes");            for (int i=1; i<nodes.getLength();i+=2){                    //Escape everything that is not a A-z , 0-9, an hyphen or a whitespace                    System.out.println(nodes.item(i).getNodeValue().replaceAll("[\\W&&[^\\x20\\x2D]]", ""));            }                        //Trunking type if trunked            expr = xpath.compile("//font/a/@href");            result = expr.evaluate(doc, XPathConstants.NODESET);            nodes = (NodeList) result;            System.out.println("+ Services: Scanning "+nodes.getLength()+" nodes");            for (int i=1; i<nodes.getLength();i+=2){                //Escape everything that is not a A-z , 0-9, an hyphen or a whitespace                String str = nodes.item(i).getNodeValue().replaceAll("[\\W&&[^\\x20\\x2D]]", "");                if(str.length() > 1 && str.contains("trunk")){                    str = str.replace("recherche_rep_trunkaspt", "");                    System.out.println(str.substring(0, str.indexOf("-")-1));                }            }*/            /*// New XPath expression to get the number of people with name lars            expr = xpath.compile("count(//person[firstname='Lars'])");            // Run the query and get the number of nodes            Double number = (Double) expr.evaluate(doc, XPathConstants.NUMBER);            System.out.println("Number of objects " +number);            // Do we have more then 2 people with name lars?            expr = xpath.compile("count(//person[firstname='Lars']) >2");            // Run the query and get the number of nodes            Boolean check = (Boolean) expr.evaluate(doc, XPathConstants.BOOLEAN);            System.out.println(check);*/        }                ///Users/philippenadeau/Desktop/freq.html	public static void main(String[] args){            try {                if(args.length == 0){                    System.out.println("Usage: java -jar scraper.jar <HTML file absolute path>");                    System.exit(1);                }                String cleanedFile  = clean(args[0]);                parse(cleanedFile);            } catch (ParserConfigurationException ex) {                System.out.println("Wrong parsing configuration.");                System.exit(1);            } catch (SAXException ex) {                System.out.println("SAX problem.");                System.exit(1);            } catch (XPathExpressionException ex) {                System.out.println("Wrong XPath.");                System.exit(1);            }catch (FileNotFoundException e) {                System.out.println("Unable to find the file.");                System.exit(1);            } catch (IOException e) {                System.out.println("Problem reading/writing the file.");                System.exit(1);            }	}	}