All pastes #2115497 Raw Edit

Mine

public text v1 · immutable
#2115497 ·published 2012-02-09 20:54 UTC
rendered paste body
package scraper;


import java.io.BufferedReader;
import java.io.File;

import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.*;

import org.htmlcleaner.*;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * Command-line scraper: tidies an HTML file with HtmlCleaner, then runs XPath
 * queries over the cleaned XML and stores what it finds in {@link #entriesDb}.
 *
 * <p>Three scans are performed, each filling one aspect of the entries in
 * document order: operating state, frequency / group id (plus CTCSS) and
 * service name. The scans are matched up purely by position, i.e. the n-th hit
 * of every scan is written to the n-th entry.
 */
public class Main {
    // Operating state of an entry, stored through Entry.setState().
    public static final int WORKING                            = 0;
    public static final int MAYBE_WORKING                      = 1;
    public static final int NOT_WORKING                        = 2;

    // Tells whether Entry.freqid holds a frequency or a group id (Entry.setFreqORid()).
    public static final int FREQUENCY                          = 0;
    public static final int GROUP_ID                           = 1;

    // Trunking system types. Nothing in this class reads them yet;
    // presumably meant for Entry.setTrunking() -- TODO confirm.
    public static final int NOT_TRUNKED_DATA_VOICE             = -2;
    public static final int NOT_TRUNKED_DATA                   = -1;
    public static final int NOT_TRUNKED_VOICE                  = 0;
    public static final int EDACS_PROVOICE                     = 1;
    public static final int EDACS                              = 2;
    public static final int GE_MARK_V                          = 3;
    public static final int LTR_STANDARD                       = 4;
    public static final int LTR_PASSPORT                       = 5;
    public static final int LTR_STANDARD_AND_PASSPORT          = 6;
    public static final int LTR_MULTINET                       = 7;
    public static final int LTR_NET                            = 8;
    public static final int MOTOROLA_TYPE_I                    = 9;
    public static final int MOTOROLA_TYPE_II                   = 10;
    public static final int MOTOROLA_TYPE_II_HYBRID            = 11;
    public static final int MOTOROLA_TYPE_II_SMARTZONE         = 12;
    public static final int MOTOROLA_TYPE_II_SMARTZONE_OMNILINK= 13;
    public static final int MOTOROLA_IDEN                      = 14;
    public static final int MOTOROLA_HARMONY                   = 15;
    public static final int MPT_1327                           = 16;
    public static final int OPENSKY_SYSTEM                     = 17;
    public static final int APCO_16                            = 18;
    public static final int APCO_25                            = 19;
    public static final int SMARTRUNK                          = 20;
    public static final int TETRA                              = 21;
    public static final int TETRAPOL                           = 22;
    public static final int KENWOOD_NEXEDGE                    = 23;
    public static final int ICOM_IDAS                          = 24;
    public static final int NXDN                               = 25;

    /** Removes every non-word character except space and hyphen (accented letters are dropped too). */
    private static final String WORD_FILTER = "[\\W&&[^\\x20\\x2D]]";
    /** Removes everything that is not a digit, a dot or a hyphen. */
    private static final String NUMERIC_FILTER = "[^\\d\\x2E\\x2D]";
    /** Removes everything that is not a word character, a dot or a hyphen (whitespace included). */
    private static final String SERVICE_FILTER = "[^\\w\\x2E\\x2D]";

    /** Entries collected by {@link #parse(String)}, in document order. */
    public static ArrayList<Entry> entriesDb = new ArrayList<Entry>();

    /**
     * Reads an HTML file, cleans it with HtmlCleaner (comments omitted, no
     * self-closing empty tags) and writes the result as pretty-printed UTF-8
     * XML next to the input.
     *
     * @param filename path of the HTML file to clean
     * @return path of the written file, i.e. {@code filename + ".clean.xml"}
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static String clean(String filename) throws IOException {
        System.out.println("Beginning cleaning operation!");
        BufferedReader bufRead = new BufferedReader(new FileReader(filename));
        // StringBuilder avoids the quadratic cost of repeated String concatenation.
        StringBuilder websiteSource = new StringBuilder();
        try {
            System.out.println("+ Reading input file,this may take a while...");
            String line;
            while ((line = bufRead.readLine()) != null) {
                websiteSource.append(line).append("\r\n");
            }
        } finally {
            // Closing the BufferedReader also closes the wrapped FileReader,
            // even when reading fails half-way.
            bufRead.close();
        }

        HtmlCleaner cleaner = new HtmlCleaner();
        CleanerProperties props = cleaner.getProperties();
        props.setOmitComments(true);
        props.setUseEmptyElementTags(false);
        System.out.println("+ Cleaning the file...");
        TagNode node = cleaner.clean(websiteSource.toString());

        String cleanedFile = filename + ".clean.xml";
        System.out.println("+ Saving the file "+filename+".clean.xml");
        new PrettyXmlSerializer(props).writeToFile(node, cleanedFile, "utf-8");
        return cleanedFile;
    }

    /**
     * Parses a cleaned XML file and fills {@link #entriesDb} with the state,
     * frequency / group id, CTCSS and service found in it.
     *
     * <p>Trunking type extraction (from the {@code href} of links containing
     * "trunk") is not implemented yet.
     *
     * @param cleanedFile path of an XML file as produced by {@link #clean(String)}
     * @throws FileNotFoundException declared for callers; see {@link IOException}
     * @throws IOException if the file cannot be read
     * @throws ParserConfigurationException if no XML parser can be created
     * @throws SAXException if the file is not well-formed XML
     * @throws XPathExpressionException if an XPath query cannot be evaluated
     */
    public static void parse(String cleanedFile) throws FileNotFoundException, IOException ,
        ParserConfigurationException, SAXException, XPathExpressionException {
        System.out.println("Beginning parsing operation!");
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true);
        DocumentBuilder builder = factory.newDocumentBuilder();
        System.out.println("+ Reading cleaned file,this may take a while...");
        Document doc = builder.parse(new File(cleanedFile));
        XPath xpath = XPathFactory.newInstance().newXPath();

        parseStates(select(xpath, doc, "//td[@width='53']/img | //td[@width='53']/p/img"));
        parseFrequencies(select(xpath, doc, "//td[@width='103']/font//text()"));
        parseServices(select(xpath, doc, "//td[@width='458']/font//text()"));
    }

    /** Compiles {@code expression} and evaluates it against {@code doc} as a node set. */
    private static NodeList select(XPath xpath, Document doc, String expression)
            throws XPathExpressionException {
        XPathExpression expr = xpath.compile(expression);
        return (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
    }

    /** Grows {@link #entriesDb} with blank entries until it holds at least {@code count} of them. */
    private static void ensureEntries(int count) {
        entriesDb.ensureCapacity(count + 1);
        for (int i = entriesDb.size(); i < count; i++) {
            entriesDb.add(new Entry());
        }
    }

    /** Returns {@code matched / scanned} as a percentage; NaN when both are zero. */
    private static double efficiency(int matched, int scanned) {
        return ((double) matched / (double) scanned) * 100;
    }

    /** Sets the state of the n-th entry from the {@code alt} text of the n-th status image. */
    private static void parseStates(NodeList nodes) {
        ensureEntries(nodes.getLength());
        System.out.println("+ State: Scanning "+nodes.getLength()+" nodes");
        int i, j;
        for (i = 0, j = 0; i < nodes.getLength(); i++, j++) {
            Node alt = nodes.item(i).getAttributes().getNamedItem("alt");
            if (alt == null) {
                // An image without alt text carries no state information; the
                // entry keeps its default state instead of crashing the scan.
                continue;
            }
            // The filter also strips accented letters, hence the "opration"
            // spellings in the comparisons below.
            String str = alt.getNodeValue().replaceAll(WORD_FILTER, "").trim();
            if (str.equals("En fonction")) {
                entriesDb.get(j).setState(WORKING);
            } else if (str.equals("valider si encore en opration")) {
                entriesDb.get(j).setState(MAYBE_WORKING);
            } else if (str.equals("Plus en opration")) {
                entriesDb.get(j).setState(NOT_WORKING);
            }
        }
        System.out.println("++ State scanning finished.: XPath efficiency:"+efficiency(j, i)+"%");
    }

    /**
     * Fills frequency / group id and CTCSS. The first usable text of a cell is
     * the frequency (contains a dot) or group id; a further usable text in the
     * same cell is taken as the CTCSS of that same entry.
     */
    private static void parseFrequencies(NodeList nodes) {
        System.out.println("+ Freq/ID: Scanning "+nodes.getLength()+" nodes");
        ensureEntries(nodes.getLength());
        int i;
        int j = 0;                 // index of the next entry to receive a frequency / group id
        int lastItemProcessed = 0; // index of the last node that produced a usable value
        for (i = 0; i < nodes.getLength(); i++) {
            // Keep only digits, dots and hyphens.
            String str = nodes.item(i).getNodeValue().replaceAll(NUMERIC_FILTER, "").trim();
            if (str.length() <= 1) {
                continue;
            }
            // "Same cell" is decided by comparing the filtered text of the
            // grand-parent nodes, not node identity.
            // NOTE(review): two consecutive cells with identical text would be
            // mistaken for one cell -- confirm this cannot occur in the input.
            boolean sameCellAsPrevious = j != 0
                    && cellDigits(nodes.item(i)).equals(cellDigits(nodes.item(lastItemProcessed)));
            if (sameCellAsPrevious) {
                // j already points past the entry whose frequency came from this
                // cell, so the CTCSS belongs to entry j - 1.
                Entry owner = entriesDb.get(j - 1);
                if (owner.getCtcss() == null) {
                    owner.setCtcss(str);
                }
            } else {
                Entry entry = entriesDb.get(j);
                entry.setFreqORid(str.contains(".") ? FREQUENCY : GROUP_ID);
                entry.setFreqid(str);
                j++;
            }
            lastItemProcessed = i;
        }
        System.out.println("++ Freq/ID scanning finished.: XPath efficiency:"+efficiency(j, i)+"%");
    }

    /** Returns the filtered text content of the grand-parent of {@code textNode}. */
    private static String cellDigits(Node textNode) {
        return textNode.getParentNode().getParentNode().getTextContent()
                .replaceAll(NUMERIC_FILTER, "").trim();
    }

    /** Sets the service of successive entries from every fourth matched text node. */
    private static void parseServices(NodeList nodes) {
        System.out.println("+ Service: Scanning "+nodes.getLength()+" nodes");
        ensureEntries(nodes.getLength());
        int i, j;
        // NOTE(review): the step of 4 presumably reflects the number of text
        // nodes per row in the source page -- confirm against the input.
        for (i = 0, j = 0; i < nodes.getLength(); i += 4) {
            String str = nodes.item(i).getParentNode().getTextContent()
                    .replaceAll(SERVICE_FILTER, "").trim();
            if (str.length() > 1) {
                entriesDb.get(j).setService(str);
                j++;
            }
        }
        System.out.println("++ Service scanning finished.: XPath efficiency:"+efficiency(j, i)+"%");
    }

    /** Prints {@code message} to stdout and the cause to stderr, then exits with status 1. */
    private static void fail(String message, Exception cause) {
        System.out.println(message);
        System.err.println(cause);
        System.exit(1);
    }

    // Example input: /Users/philippenadeau/Desktop/freq.html
    /**
     * Entry point: cleans the HTML file named by the first argument and parses
     * the result. Exits with status 1 on missing argument or any failure.
     *
     * @param args command-line arguments; {@code args[0]} is the HTML file path
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            System.out.println("Usage: java -jar scraper.jar <HTML file absolute path>");
            System.exit(1);
        }
        try {
            String cleanedFile = clean(args[0]);
            parse(cleanedFile);
        } catch (ParserConfigurationException ex) {
            fail("Wrong parsing configuration.", ex);
        } catch (SAXException ex) {
            fail("SAX problem.", ex);
        } catch (XPathExpressionException ex) {
            fail("Wrong XPath.", ex);
        } catch (FileNotFoundException e) {
            fail("Unable to find the file.", e);
        } catch (IOException e) {
            fail("Problem reading/writing the file.", e);
        }
    }

}


//-----------------



package scraper;

import java.util.ArrayList;


/**
 * One scraped record: operating state, service name, trunking type and either
 * a frequency or a group id, plus an optional CTCSS value.
 *
 * <p>Plain mutable bean. The fields are public for backward compatibility;
 * new code should go through the accessors.
 */
public class Entry {

    /** Tells what {@link #freqid} holds (Main stores its FREQUENCY or GROUP_ID constant here). */
    public int freqORid;
    /** Operating state code. */
    public int state;
    /** Trunking type code. */
    public int trunking;
    /** Service name, or {@code null} when not set. */
    public String service;
    /** Frequency or group id as text, or {@code null} when not set. */
    public String freqid;
    /** CTCSS value as text, or {@code null} when not set. */
    public String ctcss;

    /**
     * Creates a fully populated entry.
     *
     * @param state    operating state code
     * @param service  service name, may be {@code null}
     * @param trunking trunking type code
     * @param freqORid discriminator telling what {@code freqid} holds
     * @param freqid   frequency or group id, may be {@code null}
     * @param ctcss    CTCSS value, may be {@code null}
     */
    public Entry(int state, String service, int trunking, int freqORid, String freqid, String ctcss) {
        this.freqORid   = freqORid;
        this.service    = service;
        this.state      = state;
        this.trunking   = trunking;
        // Previously the freqid argument was accepted but never stored.
        this.freqid     = freqid;
        this.ctcss      = ctcss;
    }

    /** Creates a blank entry: all codes 0, all text fields {@code null}. */
    public Entry() {
        this(0, null, 0, 0, null, null);
    }

    /** @return the CTCSS value, or {@code null} when not set */
    public String getCtcss() {
        return ctcss;
    }

    /** @param ctcss the CTCSS value to store */
    public void setCtcss(String ctcss) {
        this.ctcss = ctcss;
    }

    /** @return the discriminator telling what {@link #getFreqid()} holds */
    public int getFreqORid() {
        return freqORid;
    }

    /** @param freqORid the discriminator telling what {@code freqid} holds */
    public void setFreqORid(int freqORid) {
        this.freqORid = freqORid;
    }

    /** @return the frequency or group id, or {@code null} when not set */
    public String getFreqid() {
        return freqid;
    }

    /** @param freqid the frequency or group id to store */
    public void setFreqid(String freqid) {
        this.freqid = freqid;
    }

    /** @return the service name, or {@code null} when not set */
    public String getService() {
        return service;
    }

    /** @param service the service name to store */
    public void setService(String service) {
        this.service = service;
    }

    /** @return the operating state code */
    public int getState() {
        return state;
    }

    /** @param state the operating state code to store */
    public void setState(int state) {
        this.state = state;
    }

    /** @return the trunking type code */
    public int getTrunking() {
        return trunking;
    }

    /** @param trunking the trunking type code to store */
    public void setTrunking(int trunking) {
        this.trunking = trunking;
    }

}