All pastes #2466620 Raw Edit

Miscellany

public unlisted java v1 · immutable
#2466620 ·published 2013-10-14 15:28 UTC
rendered paste body
/* * To change this template, choose Tools | Templates * and open the template bf the editor. */package crawler;import crawler.Main;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStreamReader;import java.net.URL;import java.util.ArrayList;import java.util.List;import java.util.logging.Level;import java.util.logging.Logger;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;/** * * @author syncsys */public class Crawler implements Runnable {private static final String patternString = "[_A-Za-z0-9-]+(\\.[_A-Za-z0-9-]+)*@[A-Za-z0-9]+(\\.[A-Za-z0-9]+)*(\\.[A-Za-z]{2,})";private volatile String url;private volatile String nonProcessedLinkFromDB = null;           private void crawl(String url) {        //        synchronized(Crawler.class){//            boolean markedLinkAsProcessedBoolean = new BasicDAO().markLinkAsProcesed(url);//            nonProcessedLinkFromDB = url;//            if(markedLinkAsProcessedBoolean){//              nonProcessedLinkFromDB = getNonProcessedLinkFromDB();//    //                System.out.println("Link marked processed in db "+url);////            }//        }                synchronized(Crawler.class){            url = getNonProcessedLinkFromDB();//            System.out.println("Just fetched up from db +++++++ " +url);            new BasicDAO().markLinkAsProcesed(url);        }        //        System.out.println("debug-------------------------------- 1");                   BufferedReader bf = null;        try {//        System.out.println("debug-------------------------------- 2");                             URL target = new URL(url);            bf = new BufferedReader(                    new InputStreamReader(target.openStream())                 );// System.out.println("debug ========= 1");            StringBuilder html = new StringBuilder();            String inputLine;            while ((inputLine = bf.readLine()) != null) {      //            System.out.println(inputLine);                html.append(inputLine);                            }            List emailList = new ArrayList( getEmailList(html.toString()) );            List linkList = new ArrayList( getLinkList(html.toString(), url) );            System.out.println("Just worked on --------- "+ url);//             boolean markedLinkAsProcessedBoolean = new BasicDAO().markLinkAsProcesed(url);//            if(markedLinkAsProcessedBoolean){////                System.out.println("Link marked processed in db "+url);//            }            Main.processedLinksCount++;            putEmailsInDB(emailList);//            putLinksInDB(linkList);//            System.out.println("debug-------------------------------- 4");// System.out.println("debug ========= 2");                                                     } catch (IOException ex) {            new Logging().logError(ex.toString());            new BasicDAO().deleteLink(url);//            System.out.println("debug-------------------------------- link deleted");        } catch (Exception ex) {            new Logging().logError(ex.toString());             new BasicDAO().deleteLink(url);//            System.out.println("debug-------------------------------- link deleted");        }finally{            if(bf !=null){                try {                bf.close(); // System.out.println("debug ========= 3");                } catch (IOException ex) {                    new Logging().logError(ex.toString());                }                        } //  System.out.println("debug ========= 4");              //            System.out.println("debug-------------------------------- 5");//            System.out.println("fetched non-processed link from db: ++++++++++++++++ "+ nonProcessedLinkFromDB);            crawl(null);             //           System.out.println("nonePlinkfromDB is ++++++++++++++++++++++++++" + nonProcessedLinkFromDB);                          // System.out.println("debug ========= 5");                                         } ///       String line = "kj asdkfj a;sdlfkj <p>.mydomain@domain.com</p> asdkfja sdlfkj myot1he-rD.o_main@domain.com.com.com ads";    }        private  List getLinkList(String html, String url) {        Document doc = Jsoup.parse(html);        Elements bodies = doc.select("body");        List linkList =  new ArrayList();        for(Element body : bodies ){            Elements aTags = body.getElementsByTag("a");            for (Element a: aTags){               String link =  a.attr("href");               if ( !(link.startsWith("#"))                      &&                     !(link.contains("()"))                     &&                     !(link.endsWith(".jpg"))                      &&                     !(link.endsWith(".jpeg"))                       &&                     !(link.endsWith(".png"))                       &&                     !(link.endsWith(".gif"))     ){                                       if( link.startsWith("/") ){                        link = url+link;                    }                 linkList.add(link);                 //put link in db               }                }        }                return linkList;    }    private  List getEmailList(String html) {        Pattern p = Pattern.compile(patternString);        Matcher m = p.matcher(html);        List emailList = new ArrayList();        while(m.find()){            emailList.add(m.group());            Main.nonUniqueEmailsCount++;        }               return emailList;        }            private  String getNonProcessedLinkFromDB() {        return ( new BasicDAO().getNonProcessedLink() );    }    private  void putEmailsInDB(List emailList) {        new BasicDAO().insertEmail(emailList);    }    private  void putLinksInDB(List linkList) {       new BasicDAO().insertLinks(linkList);    }    @Override    public void run() {        if(url != null){            crawl(url);        }else{ //           crawl();        }            }    public Crawler(String url){        this.url = url;    }        public Crawler(){        this.url =  null;    }}