rendered paste body/* * To change this template, choose Tools | Templates * and open the template bf the editor. */package crawler;import crawler.Main;import static crawler.Main.basicDAO;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStreamReader;import java.net.URL;import java.util.ArrayList;import java.util.List;import java.util.logging.Level;import java.util.logging.Logger;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;/** * * @author syncsys */public class Crawler implements Runnable {private static final String patternString = "[_A-Za-z0-9-]+(\\.[_A-Za-z0-9-]+)*@[A-Za-z0-9]+(\\.[A-Za-z0-9]+)*(\\.[A-Za-z]{2,})";private volatile String url;private volatile String nonProcessedLinkFromDB = null; private void crawl(String url) { synchronized(Crawler.class){ url = getNonProcessedLinkFromDB(); new BasicDAO().markLinkAsProcesed(url); } BufferedReader bf = null; try { URL target = new URL(url); bf = new BufferedReader( new InputStreamReader(target.openStream()) ); StringBuilder html = new StringBuilder(); String inputLine; while ((inputLine = bf.readLine()) != null) { html.append(inputLine); } List emailList = new ArrayList( getEmailList(html.toString()) ); List linkList = new ArrayList( getLinkList(html.toString(), url) ); System.out.println("Just worked on --------- "+ url); Main.processedLinksCount++; putEmailsInDB(emailList); putLinksInDB(linkList); } catch (IOException ex) { new Logging().logError(ex.toString()); basicDAO.deleteLink(url); } catch (Exception ex) { new Logging().logError(ex.toString()); basicDAO.deleteLink(url); }finally{ if(bf !=null){ try { bf.close(); } catch (IOException ex) { new Logging().logError(ex.toString()); } } crawl(null); } } private List getLinkList(String html, String url) { Document doc = Jsoup.parse(html); Elements bodies = doc.select("body"); List linkList = new ArrayList(); for(Element body : bodies ){ Elements aTags = body.getElementsByTag("a"); for (Element a: aTags){ String link = a.attr("href"); if ( !(link.startsWith("#")) && !(link.contains("()")) && !(link.endsWith(".jpg")) && !(link.endsWith(".jpeg")) && !(link.endsWith(".png")) && !(link.endsWith(".gif")) ){ if( link.startsWith("/") ){ link = url+link; } linkList.add(link); //put link in db } } } return linkList; } private List getEmailList(String html) { Pattern p = Pattern.compile(patternString); Matcher m = p.matcher(html); List emailList = new ArrayList(); while(m.find()){ emailList.add(m.group()); Main.nonUniqueEmailsCount++; } return emailList; } private String getNonProcessedLinkFromDB() { return ( basicDAO.getNonProcessedLink() ); } private void putEmailsInDB(List emailList) { basicDAO.insertEmail(emailList); } private void putLinksInDB(List linkList) { basicDAO.insertLinks(linkList); } @Override public void run() { if(url != null){ crawl(url); }else{ // crawl(); } } public Crawler(String url){ this.url = url; } public Crawler(){ this.url = null; }}