All pastes #1178939 Raw Copy code Copy link Edit

crawl.rb

public ruby v1 · immutable
#1178939 ·published 2008-08-19 19:43 UTC
rendered paste body
#!/usr/bin/ruby
# crawl.rb - for every keyword in an uploaded text file, fetch the first page
# of Google results and report where TARGET_DOMAIN first shows up.
# Output is one "term,link,place" row per keyword, printed after a CGI
# content-type header and also written to OUTPUT_CSV.

# All the requirements
require "cgi"
require "net/http"
require "uri"
require "date"

# Directory and name of the keyword file (one search phrase per line).
# The name used to come from the CGI "filename" parameter
# (cgi = CGI.new; fname = cgi['filename']); it is hard-coded for now.
UPLOAD_DIR = "../uploads"
KEYWORD_FILE = "keywordsforben.txt"

# File that receives a copy of the printed rows.
OUTPUT_CSV = "output.csv"

# Domain whose position in the results is reported.
TARGET_DOMAIN = "wikipedia.org"

# Only this many filtered results are searched for the domain.
RESULTS_CONSIDERED = 10

# Link text reported when the domain is not among the results.
NOT_FOUND = "not found"

# A link containing any of these substrings is discarded by sortlinks.
BLOCKED_FRAGMENTS = [
  "search?q=cache:",       # part of google's caching system
  "google",                # something from google (mail, docs, etc)
  "\n",                    # links shouldn't have newline characters
  "froogle",               # links shouldn't be store items
  "Froogle",
  "cm_mmc=seo-_-feeds",
  "youtube.com/results?q", # links shouldn't be youtube searches
  " "                      # links shouldn't have spaces
].freeze

# Characters 7..9 of a link (the text right after "http://") that mark the
# manually configured google cache servers; superseded by the
# "search?q=cache:" rule but still filtered.
BLOCKED_HOST_PREFIXES = ["209", "64.", "72."].freeze

# Pulls the value out of the first double-quoted section of an HTML fragment
# such as 'href="http://example.com/">text'.
#
# href - String fragment expected to contain a double-quoted value.
#
# Returns the text between the first '"' and the next '"' (or the end of the
# fragment when the closing quote is missing) with newlines and single quotes
# removed. Returns "" when the fragment contains no double quote.
def cutlink(href)
  opening = href.index('"')
  return "" if opening.nil?

  value = href[(opening + 1)..-1]
  closing = value.index('"')
  value = value[0...closing] if closing
  value.delete("\n'")
end

# Collects the link of every space-separated token of body that starts with
# "href" (any letter case).
#
# body - String of HTML.
#
# Returns an Array of downcased link Strings in document order. Entries may be
# empty when a token has no quoted value; sortlinks removes those.
def extract_links(body)
  body.split(/ /).each_with_object([]) do |token, found|
    next unless token[0, 4].to_s.downcase == "href"

    found << cutlink(token).downcase
  end
end

# Same links as extract_links, serialised as one String in which every link is
# followed by "@". Kept for callers that expect the original string format;
# note that a link which itself contains "@" cannot be recovered by splitting.
def cutlinks(body)
  extract_links(body).map { |link| link + "@" }.join
end

# Decides whether a scraped link should be left out of the ranking.
def skip_link?(link)
  # The nil/empty test has to come first: every other rule calls String methods.
  return true if link.nil? || link.empty?
  # Local links and links that use secure socket layer are not ranked.
  return true if link.start_with?("/", "https")
  return true if BLOCKED_FRAGMENTS.any? { |fragment| link.include?(fragment) }

  BLOCKED_HOST_PREFIXES.include?(link[7..9])
end

# Filters scraped links down to the ones that count as search results.
#
# links - Array of link Strings; nil entries are tolerated.
#
# Returns a new Array with the surviving links in their original order.
def sortlinks(links)
  links.reject { |link| skip_link?(link) }
end

# Builds the search URL for an already URL-encoded query term.
def google_url(term)
  "http://www.google.com/search?q=#{term}&num=10&hl=en&lr=&as_qdr=all&start=0&sa=N"
end

# Fetches the first ten results for a search.
#
# term - String query, already URL-encoded (spaces as "+").
#
# Returns the response body returned by Net::HTTP.
def google(term)
  Net::HTTP.get_response(URI.parse(google_url(term))).body
end

# Finds the first of the top RESULTS_CONSIDERED links that mentions domain.
#
# links  - Array of link Strings (no nil entries).
# domain - String to look for inside each link.
#
# Returns [link, place] with place counted from 1, or [NOT_FOUND, 0] when no
# link matches.
def find_rank(links, domain)
  index = links.first(RESULTS_CONSIDERED).index { |link| link.include?(domain) }
  index ? [links[index], index + 1] : [NOT_FOUND, 0]
end

# Reads the search phrases, one per line, without line terminators.
# Blank lines are skipped so they do not trigger empty searches.
def read_keywords(path)
  File.readlines(path).map(&:chomp).reject { |line| line.strip.empty? }
end

# Program start: search every keyword, rank TARGET_DOMAIN, emit the rows.
def main
  path = File.join(UPLOAD_DIR, KEYWORD_FILE)
  unless File.file?(path)
    # Report the problem instead of failing with a bare Errno::ENOENT.
    warn "crawl: keyword file not found: #{path}"
    return
  end

  keywords = read_keywords(path)
  # Encode the whole phrase so "&", "#" and similar cannot break the query.
  bodies = keywords.map { |keyword| google(URI.encode_www_form_component(keyword)) }

  print "Content-type: text/html\n\n"
  rows = keywords.zip(bodies).map do |keyword, body|
    links = sortlinks(extract_links(body.to_s)).uniq
    link, place = find_rank(links, TARGET_DOMAIN)
    # Terms are reported with "+" between the words.
    "#{keyword.tr(' ', '+')},#{link},#{place}"
  end

  # The block form closes the file even if writing fails.
  File.open(OUTPUT_CSV, "w") { |sink| rows.each { |row| sink.puts(row) } }
  rows.each { |row| print row + "\n" }
end

main if __FILE__ == $PROGRAM_NAME