package scraper;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.*;
import org.htmlcleaner.*;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
 * Command-line scraper for a saved HTML page.
 *
 * <p>The page is first normalised into well-formed XML with HtmlCleaner
 * ({@link #clean(String)}), then table cells are located with XPath and their
 * contents are copied into {@link #entriesDb} ({@link #parse(String)}).
 * Cells are identified purely by their {@code width} attribute.
 */
public class Main {
    // Operating state of an entry, stored through Entry.setState().
    public static final int WORKING = 0;
    public static final int MAYBE_WORKING = 1;
    public static final int NOT_WORKING = 2;

    // Tells whether Entry.freqid holds a frequency or a group id (Entry.setFreqORid()).
    public static final int FREQUENCY = 0;
    public static final int GROUP_ID = 1;

    // Trunking system codes. Not referenced by the code in this class;
    // presumably meant for Entry.setTrunking() -- TODO confirm.
    public static final int NOT_TRUNKED_DATA_VOICE = -2;
    public static final int NOT_TRUNKED_DATA = -1;
    public static final int NOT_TRUNKED_VOICE = 0;
    public static final int EDACS_PROVOICE = 1;
    public static final int EDACS = 2;
    public static final int GE_MARK_V = 3;
    public static final int LTR_STANDARD = 4;
    public static final int LTR_PASSPORT = 5;
    public static final int LTR_STANDARD_AND_PASSPORT = 6;
    public static final int LTR_MULTINET = 7;
    public static final int LTR_NET = 8;
    public static final int MOTOROLA_TYPE_I = 9;
    public static final int MOTOROLA_TYPE_II = 10;
    public static final int MOTOROLA_TYPE_II_HYBRID = 11;
    public static final int MOTOROLA_TYPE_II_SMARTZONE = 12;
    public static final int MOTOROLA_TYPE_II_SMARTZONE_OMNILINK= 13;
    public static final int MOTOROLA_IDEN = 14;
    public static final int MOTOROLA_HARMONY = 15;
    public static final int MPT_1327 = 16;
    public static final int OPENSKY_SYSTEM = 17;
    public static final int APCO_16 = 18;
    public static final int APCO_25 = 19;
    public static final int SMARTRUNK = 20;
    public static final int TETRA = 21;
    public static final int TETRAPOL = 22;
    public static final int KENWOOD_NEXEDGE = 23;
    public static final int ICOM_IDAS = 24;
    public static final int NXDN = 25;

    /** Removes every non-word character except space (0x20) and hyphen (0x2D). */
    private static final String KEEP_WORD_SPACE_HYPHEN = "[\\W&&[^\\x20\\x2D]]";
    /** Removes everything that is not a digit, a dot (0x2E) or a hyphen (0x2D). */
    private static final String KEEP_DIGIT_DOT_HYPHEN = "[^\\d\\x2E\\x2D]";
    /** Removes everything that is not a word character, a dot or a hyphen (whitespace included). */
    private static final String KEEP_WORD_DOT_HYPHEN = "[^\\w\\x2E\\x2D]";

    /** Scraped rows; each parsing pass fills one aspect of the entries, matched by index. */
    public static ArrayList<Entry> entriesDb = new ArrayList<Entry>();

    /**
     * Reads an HTML file, cleans it with HtmlCleaner and writes the result as
     * pretty-printed XML next to the input.
     *
     * @param filename path of the HTML file to clean
     * @return the path of the written file, i.e. {@code filename + ".clean.xml"}
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static String clean(String filename) throws IOException{
        System.out.println("Beginning cleaning operation!");
        // NOTE(review): FileReader decodes with the platform default charset while the
        // output is written as utf-8 -- confirm the encoding of the input pages.
        BufferedReader bufRead = new BufferedReader(new FileReader(filename));
        StringBuilder websiteSource = new StringBuilder();
        try {
            System.out.println("+ Reading input file,this may take a while...");
            String line = bufRead.readLine();
            while (line != null) {
                // Line terminators are normalised to CRLF, as readLine() strips them.
                websiteSource.append(line).append("\r\n");
                line = bufRead.readLine();
            }
        } finally {
            // Closing the BufferedReader also closes the wrapped FileReader,
            // even when reading fails half-way.
            bufRead.close();
        }

        HtmlCleaner cleaner = new HtmlCleaner();
        CleanerProperties props = cleaner.getProperties();
        props.setOmitComments(true);
        props.setUseEmptyElementTags(false);

        System.out.println("+ Cleaning the file...");
        TagNode node = cleaner.clean(websiteSource.toString());

        String cleanedName = filename+".clean.xml";
        System.out.println("+ Saving the file "+cleanedName);
        new PrettyXmlSerializer(props).writeToFile(node, cleanedName, "utf-8");
        return cleanedName;
    }

    /**
     * Parses a cleaned XML file and fills {@link #entriesDb} with the state,
     * the frequency / group id (plus CTCSS) and the service of each row.
     *
     * @param cleanedFile path of an XML file as produced by {@link #clean(String)}
     * @throws FileNotFoundException if the file does not exist
     * @throws IOException if the file cannot be read
     * @throws ParserConfigurationException if no DOM parser can be created
     * @throws SAXException if the file is not well-formed XML
     * @throws XPathExpressionException if an XPath expression fails to compile or evaluate
     */
    public static void parse(String cleanedFile) throws FileNotFoundException, IOException ,
            ParserConfigurationException, SAXException, XPathExpressionException {
        System.out.println("Beginning parsing operation!");
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true);
        DocumentBuilder builder = factory.newDocumentBuilder();

        System.out.println("+ Reading cleaned file,this may take a while...");
        Document doc = builder.parse(new File(cleanedFile));
        XPath xpath = XPathFactory.newInstance().newXPath();

        parseStates(xpath, doc);
        parseFrequencies(xpath, doc);
        parseServices(xpath, doc);

        // TODO: trunking type is not extracted yet. The abandoned attempt selected
        // "//tr/td/font/a/@href", kept values containing "trunk" and stripped
        // "recherche_rep_trunkaspt" from them; Entry.trunking keeps its default.
    }

    /** Reads the status icon of each row and stores the matching state constant. */
    private static void parseStates(XPath xpath, Document doc) throws XPathExpressionException {
        XPathExpression expr = xpath.compile("//td[@width='53']/img | //td[@width='53']/p/img");
        NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
        int total = nodes.getLength();
        ensureEntries(total);
        System.out.println("+ State: Scanning "+total+" nodes");

        for (int i = 0; i < total; i++) {
            // Assumes every matched <img> carries an alt attribute (dereferenced unchecked).
            Node alt = nodes.item(i).getAttributes().getNamedItem("alt");
            // The filter also drops non-ASCII letters, which is why the literals below
            // lack their accented characters (e.g. "opration").
            String str = alt.getNodeValue().replaceAll(KEEP_WORD_SPACE_HYPHEN, "").trim();
            Entry entry = entriesDb.get(i);
            if (str.equals("En fonction")) {
                entry.setState(WORKING);
            } else if (str.equals("valider si encore en opration")) {
                entry.setState(MAYBE_WORKING);
            } else if (str.equals("Plus en opration")) {
                entry.setState(NOT_WORKING);
            }
            // Any other alt text leaves the entry's state untouched.
        }
        // Every image maps onto exactly one entry, so matched == scanned here.
        System.out.println("++ State scanning finished.: XPath efficiency:"+efficiency(total, total)+"%");
    }

    /** Reads the frequency / group id column, including an optional CTCSS value. */
    private static void parseFrequencies(XPath xpath, Document doc) throws XPathExpressionException {
        XPathExpression expr = xpath.compile("//td[@width='103']/font//text()");
        NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
        int total = nodes.getLength();
        System.out.println("+ Freq/ID: Scanning "+total+" nodes");
        ensureEntries(total);

        int lastItemProcessed = 0; // index of the last text node that carried a value
        int j = 0;                 // index of the next entry to receive a frequency / id
        for (int i = 0; i < total; i++) {
            // Strip everything that is not a digit, a dot or a hyphen.
            String str = nodes.item(i).getNodeValue().replaceAll(KEEP_DIGIT_DOT_HYPHEN, "").trim();
            if (str.length() <= 1) {
                continue; // whitespace-only or otherwise meaningless text node
            }
            // A text node whose enclosing element has the same numeric content as the one
            // of the previously used node is treated as a second value of the same row.
            boolean sameCellAsPrevious = j != 0
                    && cellDigits(nodes.item(i)).equals(cellDigits(nodes.item(lastItemProcessed)));
            if (sameCellAsPrevious) {
                // NOTE(review): j was already advanced past the entry whose frequency was
                // just stored, so this writes onto the following entry -- confirm whether
                // j - 1 was intended.
                if (entriesDb.get(j).getCtcss() == null) {
                    entriesDb.get(j).setCtcss(str);
                }
            } else {
                Entry entry = entriesDb.get(j);
                // Values containing a dot are frequencies, anything else is a group id.
                entry.setFreqORid(str.contains(".") ? FREQUENCY : GROUP_ID);
                entry.setFreqid(str);
                j++;
            }
            lastItemProcessed = i;
        }
        System.out.println("++ Freq/ID scanning finished.: XPath efficiency:"+efficiency(j, total)+"%");
    }

    /** Reads the service column. */
    private static void parseServices(XPath xpath, Document doc) throws XPathExpressionException {
        XPathExpression expr = xpath.compile("//td[@width='458']/font//text()");
        NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
        System.out.println("+ Service: Scanning "+nodes.getLength()+" nodes");
        ensureEntries(nodes.getLength());

        int i;
        int j = 0;
        // Only every fourth text node is inspected -- presumably each row yields four
        // text nodes in this column; TODO confirm against the page layout.
        for (i = 0; i < nodes.getLength(); i += 4) {
            String str = nodes.item(i).getParentNode().getTextContent()
                    .replaceAll(KEEP_WORD_DOT_HYPHEN, "").trim();
            if (str.length() > 1) {
                entriesDb.get(j).setService(str);
                j++;
            }
        }
        // i is the loop's final index (a multiple of 4), not the node count.
        System.out.println("++ Service scanning finished.: XPath efficiency:"+efficiency(j, i)+"%");
    }

    /** Grows {@link #entriesDb} with blank entries until it holds at least {@code count}. */
    private static void ensureEntries(int count) {
        entriesDb.ensureCapacity(count + 1);
        while (entriesDb.size() < count) {
            entriesDb.add(new Entry());
        }
    }

    /** Returns the numeric content (digits, dots, hyphens) of a text node's grandparent element. */
    private static String cellDigits(Node textNode) {
        return textNode.getParentNode().getParentNode().getTextContent()
                .replaceAll(KEEP_DIGIT_DOT_HYPHEN, "").trim();
    }

    /** Returns matched / scanned as a percentage; NaN when nothing was scanned. */
    private static double efficiency(int matched, int scanned) {
        return ((double) matched / (double) scanned) * 100;
    }

    /** Prints a short message to stdout and the exception to stderr, then exits with status 1. */
    private static void fail(String message, Exception cause) {
        System.out.println(message);
        System.err.println(cause);
        System.exit(1);
    }

    /**
     * Entry point: cleans then parses the HTML file named by the first argument.
     *
     * @param args {@code args[0]} is the absolute path of the HTML file,
     *             for instance {@code /path/to/freq.html}
     */
    public static void main(String[] args){
        if (args.length == 0) {
            System.out.println("Usage: java -jar scraper.jar <HTML file absolute path>");
            System.exit(1);
        }
        try {
            String cleanedFile = clean(args[0]);
            parse(cleanedFile);
        } catch (ParserConfigurationException ex) {
            fail("Wrong parsing configuration.", ex);
        } catch (SAXException ex) {
            fail("SAX problem.", ex);
        } catch (XPathExpressionException ex) {
            fail("Wrong XPath.", ex);
        } catch (FileNotFoundException e) {
            fail("Unable to find the file.", e);
        } catch (IOException e) {
            fail("Problem reading/writing the file.", e);
        }
    }
}
//-----------------
package scraper;
import java.util.ArrayList;
/**
 * One scraped row: its operating state, service name, trunking code and
 * either a frequency or a group id, plus an optional CTCSS value.
 *
 * <p>Plain mutable data holder; the fields are public and also exposed
 * through getters and setters.
 */
public class Entry {
    public int freqORid,state,trunking;
    public String service, freqid,ctcss;

    /**
     * Creates a fully populated entry.
     *
     * @param state    operating state code
     * @param service  service name
     * @param trunking trunking system code
     * @param freqORid discriminator telling whether {@code freqid} is a frequency or a group id
     * @param freqid   the frequency or group id, as text
     * @param ctcss    the CTCSS value, as text
     */
    public Entry(int state, String service, int trunking, int freqORid, String freqid, String ctcss){
        this.freqORid = freqORid;
        this.service = service;
        this.state = state;
        this.trunking = trunking;
        // Previously this argument was accepted but never stored.
        this.freqid = freqid;
        this.ctcss = ctcss;
    }

    /** Creates a blank entry: numeric fields are 0 and text fields are {@code null}. */
    public Entry(){
        this.freqORid = 0;
        this.service = null;
        this.state = 0;
        this.trunking = 0;
        this.freqid = null;
        this.ctcss = null;
    }

    /** @return the CTCSS value, or {@code null} if none was set */
    public String getCtcss() {
        return ctcss;
    }

    /** @param ctcss the CTCSS value to store */
    public void setCtcss(String ctcss) {
        this.ctcss = ctcss;
    }

    /** @return the discriminator for {@link #getFreqid()} */
    public int getFreqORid() {
        return freqORid;
    }

    /** @param freqORid the discriminator for {@link #getFreqid()} */
    public void setFreqORid(int freqORid) {
        this.freqORid = freqORid;
    }

    /** @return the frequency or group id, or {@code null} if none was set */
    public String getFreqid() {
        return freqid;
    }

    /** @param freqid the frequency or group id to store */
    public void setFreqid(String freqid) {
        this.freqid = freqid;
    }

    /** @return the service name, or {@code null} if none was set */
    public String getService() {
        return service;
    }

    /** @param service the service name to store */
    public void setService(String service) {
        this.service = service;
    }

    /** @return the operating state code */
    public int getState() {
        return state;
    }

    /** @param state the operating state code to store */
    public void setState(int state) {
        this.state = state;
    }

    /** @return the trunking system code */
    public int getTrunking() {
        return trunking;
    }

    /** @param trunking the trunking system code to store */
    public void setTrunking(int trunking) {
        this.trunking = trunking;
    }
}