All pastes #2467025 Raw Edit

Miscellany

public unlisted java v1 · immutable
#2467025 ·published 2013-10-15 17:16 UTC
rendered paste body
/* * To change this template, choose Tools | Templates * and open the template bf the editor. */package crawler;import crawler.Main;import static crawler.Main.basicDAO;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStreamReader;import java.net.URL;import java.util.ArrayList;import java.util.List;import java.util.logging.Level;import java.util.logging.Logger;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;/** * * @author syncsys */public class Crawler implements Runnable {private static final String patternString = "[_A-Za-z0-9-]+(\\.[_A-Za-z0-9-]+)*@[A-Za-z0-9]+(\\.[A-Za-z0-9]+)*(\\.[A-Za-z]{2,})";private volatile String url;private volatile String nonProcessedLinkFromDB = null;           private void crawl(String url) {                        synchronized(Crawler.class){            url = getNonProcessedLinkFromDB();            new BasicDAO().markLinkAsProcesed(url);        }                BufferedReader bf = null;        try {                         URL target = new URL(url);            bf = new BufferedReader(                    new InputStreamReader(target.openStream())                 );            StringBuilder html = new StringBuilder();            String inputLine;            while ((inputLine = bf.readLine()) != null) {                html.append(inputLine);            }            List emailList = new ArrayList( getEmailList(html.toString()) );            List linkList = new ArrayList( getLinkList(html.toString(), url) );            System.out.println("Just worked on --------- "+ url);            Main.processedLinksCount++;            putEmailsInDB(emailList);             putLinksInDB(linkList);        } catch (IOException ex) {            new Logging().logError(ex.toString());            basicDAO.deleteLink(url);        } catch (Exception ex) {            new Logging().logError(ex.toString());             basicDAO.deleteLink(url);        }finally{            if(bf !=null){                try {                bf.close();                } catch (IOException ex) {                    new Logging().logError(ex.toString());                }                        }            crawl(null);        }    }        private  List getLinkList(String html, String url) {        Document doc = Jsoup.parse(html);        Elements bodies = doc.select("body");        List linkList =  new ArrayList();        for(Element body : bodies ){            Elements aTags = body.getElementsByTag("a");            for (Element a: aTags){               String link =  a.attr("href");               if ( !(link.startsWith("#"))                      &&                     !(link.contains("()"))                     &&                     !(link.endsWith(".jpg"))                      &&                     !(link.endsWith(".jpeg"))                       &&                     !(link.endsWith(".png"))                       &&                     !(link.endsWith(".gif"))     ){                                       if( link.startsWith("/") ){                        link = url+link;                    }                 linkList.add(link);                 //put link in db               }                }        }                return linkList;    }    private  List getEmailList(String html) {        Pattern p = Pattern.compile(patternString);        Matcher m = p.matcher(html);        List emailList = new ArrayList();        while(m.find()){            emailList.add(m.group());            Main.nonUniqueEmailsCount++;        }               return emailList;        }            private  String getNonProcessedLinkFromDB() {        return ( basicDAO.getNonProcessedLink() );    }    private  void putEmailsInDB(List emailList) {        basicDAO.insertEmail(emailList);    }    private  void putLinksInDB(List linkList) {       basicDAO.insertLinks(linkList);    }    @Override    public void run() {        if(url != null){            crawl(url);        }else{ //          crawl();        }            }    public Crawler(String url){        this.url = url;    }        public Crawler(){        this.url =  null;    }}