/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package crawler;

import crawler.Main;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Worker that repeatedly takes an unprocessed link from the database, downloads the page,
 * extracts e-mail addresses from its text and stores them through {@code BasicDAO}.
 *
 * <p>Links are claimed while holding the {@code Crawler.class} monitor, so the
 * "fetch next link" and "mark it processed" steps are not interleaved between
 * {@code Crawler} instances running on different threads.
 *
 * <p>NOTE(review): {@code Main.processedLinksCount} and {@code Main.nonUniqueEmailsCount}
 * are incremented with {@code ++} outside any lock; if several crawlers run concurrently
 * those counters may lose updates - confirm whether exact counts matter.
 *
 * @author syncsys
 */
public class Crawler implements Runnable {

    private static final String patternString =
            "[_A-Za-z0-9-]+(\\.[_A-Za-z0-9-]+)*@[A-Za-z0-9]+(\\.[A-Za-z0-9]+)*(\\.[A-Za-z]{2,})";

    /** Compiled once; {@link Pattern} instances are immutable and safe to share. */
    private static final Pattern EMAIL_PATTERN = Pattern.compile(patternString);

    /** Seed value; only its non-nullness is consulted (see {@link #run()}). */
    private final String url;

    /**
     * Creates a crawler that will start working when {@link #run()} is invoked.
     *
     * @param url seed URL; a {@code null} value makes {@link #run()} a no-op
     */
    public Crawler(String url) {
        this.url = url;
    }

    /** Creates a crawler whose {@link #run()} does nothing (no seed URL). */
    public Crawler() {
        this.url = null;
    }

    /** Starts crawling if this instance was constructed with a non-null URL. */
    @Override
    public void run() {
        if (url != null) {
            crawl(url);
        }
    }

    /**
     * Processes links from the database one after another until none can be claimed.
     *
     * <p>This used to call itself from a {@code finally} block, which grew the call stack
     * by one frame per processed link and eventually ended in a {@code StackOverflowError};
     * it is now a plain loop.
     *
     * @param url ignored - the link to work on is always taken from the database
     */
    private void crawl(String url) {
        while (true) {
            String link = claimNextLink();
            if (link == null) {
                // Nothing to claim: stop instead of trying to open a null URL.
                // Assumes the DAO returns null when no unprocessed link is left - TODO confirm.
                return;
            }
            processLink(link);
        }
    }

    /**
     * Fetches the next unprocessed link and marks it as processed under the class-wide lock.
     *
     * @return the claimed link, or {@code null} if the DAO returned none
     */
    private String claimNextLink() {
        synchronized (Crawler.class) {
            String link = getNonProcessedLinkFromDB();
            if (link != null) {
                new BasicDAO().markLinkAsProcesed(link);
            }
            return link;
        }
    }

    /**
     * Downloads one page, extracts its e-mail addresses and links, and stores the addresses.
     * Any failure is logged and the link is deleted from the database.
     *
     * @param link the URL to download
     */
    private void processLink(String link) {
        try {
            String html = fetchHtml(link);
            List<String> emailList = getEmailList(html);
            // Links are still extracted, but storing them (putLinksInDB) is currently disabled.
            List<String> linkList = getLinkList(html, link);

            System.out.println("Just worked on --------- " + link);
            Main.processedLinksCount++;
            putEmailsInDB(emailList);
        } catch (Exception ex) {
            // Covers I/O failures (including malformed URLs) as well as unexpected runtime errors.
            new Logging().logError(ex.toString());
            new BasicDAO().deleteLink(link);
        }
    }

    /**
     * Reads the whole document behind {@code link} as text.
     *
     * <p>NOTE(review): the stream is decoded with the platform default charset, as before.
     *
     * @param link the URL to open
     * @return the page content, one {@code '\n'} after every line read
     * @throws IOException if the URL is malformed or the stream cannot be read
     */
    private String fetchHtml(String link) throws IOException {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new URL(link).openStream()));
            StringBuilder html = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                // readLine() strips the terminator; put one back so that text at the end of
                // one line cannot fuse with the start of the next (e.g. "a@b.com" + "next").
                html.append(line).append('\n');
            }
            return html.toString();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ex) {
                    new Logging().logError(ex.toString());
                }
            }
        }
    }

    /**
     * Collects the {@code href} values of all {@code <a>} elements inside {@code <body>}.
     * Fragment links, links containing {@code "()"}, links ending in an image extension and
     * anchors without an {@code href} are skipped. Links starting with {@code "/"} are
     * resolved against the page URL; all other links are returned as written.
     *
     * @param html the page markup
     * @param url  the URL the markup was downloaded from
     * @return the accepted links in document order
     */
    private List<String> getLinkList(String html, String url) {
        Document doc = Jsoup.parse(html);
        Elements bodies = doc.select("body");
        List<String> linkList = new ArrayList<String>();
        for (Element body : bodies) {
            for (Element a : body.getElementsByTag("a")) {
                String link = a.attr("href");
                if (isIgnoredLink(link)) {
                    continue;
                }
                if (link.startsWith("/")) {
                    link = resolveAgainstPage(url, link);
                }
                linkList.add(link);
            }
        }
        return linkList;
    }

    /** Returns {@code true} for hrefs that must not be collected. */
    private static boolean isIgnoredLink(String link) {
        return link.isEmpty() // attr() yields "" when the anchor has no href
                || link.startsWith("#")
                || link.contains("()")
                || link.endsWith(".jpg")
                || link.endsWith(".jpeg")
                || link.endsWith(".png")
                || link.endsWith(".gif");
    }

    /**
     * Resolves a link that starts with {@code "/"} against the page it was found on.
     * Plain concatenation would turn {@code http://host/dir/page.html} + {@code /x} into
     * {@code http://host/dir/page.html/x}; URL resolution yields {@code http://host/x}.
     * Falls back to concatenation if either part cannot be parsed as a URL.
     */
    private static String resolveAgainstPage(String pageUrl, String link) {
        try {
            return new URL(new URL(pageUrl), link).toExternalForm();
        } catch (MalformedURLException ex) {
            return pageUrl + link;
        }
    }

    /**
     * Finds every substring of {@code html} that matches the e-mail pattern and bumps
     * {@code Main.nonUniqueEmailsCount} once per match.
     *
     * @param html the text to scan
     * @return all matches in order of appearance, duplicates included
     */
    private List<String> getEmailList(String html) {
        Matcher matcher = EMAIL_PATTERN.matcher(html);
        List<String> emailList = new ArrayList<String>();
        while (matcher.find()) {
            emailList.add(matcher.group());
            Main.nonUniqueEmailsCount++;
        }
        return emailList;
    }

    /** Asks the DAO for a link that has not been processed yet. */
    private String getNonProcessedLinkFromDB() {
        return new BasicDAO().getNonProcessedLink();
    }

    /** Hands the extracted addresses to the DAO for insertion. */
    private void putEmailsInDB(List<String> emailList) {
        new BasicDAO().insertEmail(emailList);
    }

    /** Hands the extracted links to the DAO for insertion (currently not called). */
    private void putLinksInDB(List<String> linkList) {
        new BasicDAO().insertLinks(linkList);
    }
}