用 Ruby 写的一个网络爬虫程序
前几天写的一个 Ruby 爬虫，专门抓取指定网站上的 Email 地址。
require 'net/http'
require 'uri'

# A small breadth-first web crawler that collects e-mail addresses.
#
# Starting from one URL it downloads the page, extracts every e-mail address
# and every link, and then follows the links level by level until either the
# maximum depth or the maximum number of pages is reached. Only plain
# "http://" URLs are followed.
module EmailSpider
  # Matches an e-mail address.
  EMAIL = /\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*/i

  # Matches an absolute http URL. The fullest URI looks like this:
  # http://www.sina.com.cn:80/abc?query='123'&page='http://eee.sss.vv:8080'
  URL = /http:\/\/\w+(\.\w+)*([:]?\d*)?[\/]?[0-9a-zA-Z\/=\?%#\-\&_~\`@\[\]\':\+!\.]*/i

  # Matches links that do not lead to an HTML page: static resources (by
  # extension) and script calls such as "foo();". The extension must be
  # followed by the end of the link, a query or a fragment; otherwise ".js"
  # would also reject "index.jsp" and ".ps" would reject "www.psu.edu".
  NON_HTML_URL = /(?:\.dtd|\.css|\.js|\.pdf|\.png|\.jpg|\.ps)(?=[?#]|$)|\(.*\)[;]?[ ]*$/i

  # Matches URLs on a google host; these are never followed.
  GOOGLE_URL = /http:\/\/\w+\.google\./i

  # Matches the scheme/host/port prefix of an http URL.
  DOMAIN = /http:\/\/\w+(\.\w+)*([:]?\d*)?[\/]?/i

  # URLs containing one of these extensions are treated as pages, every other
  # URL as a directory that needs a trailing slash.
  PAGE_URL = /\.(?:htm.?|jsp|asp|php)/i

  # Matches a <frame>/<iframe> tag and captures the value of its src attribute.
  FRAME_TAG = /<i?frame\b[^>]*?\ssrc\s*=\s*["']([^"'<>]+)["'][^>]*>/i

  # Matches an <a> tag and captures the value of its href attribute.
  ANCHOR_TAG = /<a\b[^>]*?\shref\s*=\s*["']([^"'<>]+)["']/i

  # Frames nested deeper than this are not downloaded. This also stops the
  # recursion for pages whose frame points back at the page itself.
  MAX_FRAME_DEPTH = 3

  # The e-mail addresses found on one page.
  class EmailSet
    attr_accessor :emails, :url

    # @param emails [Array<String>] addresses found on the page
    # @param url [String] address of the page
    def initialize(emails, url)
      @emails = emails
      @url = url
    end
  end

  # Downloads a page. The content of every frame of the page is downloaded as
  # well and inserted in place of the frame tag.
  #
  # @param url [String] the page address; it is not modified
  # @return [String] the page content, or "" when the address is not an http
  #   URL or the download fails (best effort: a missing page must not stop
  #   the crawl)
  def self.query_by_url(url)
    fetch_page(url, 0)
  end

  # Collects all e-mail addresses of a document.
  #
  # @param html [String, nil] the document
  # @return [Array<String>] the addresses in document order, duplicates included
  def self.find_email_from_html(html)
    emails = []
    # EMAIL contains capture groups, so the whole match is taken from
    # Regexp.last_match instead of from the scan result.
    html.to_s.scan(EMAIL) { emails << Regexp.last_match(0) }
    emails
  end

  # Collects the links of a document that may lead to further HTML pages.
  # Absolute http URLs are taken from anywhere in the text; the relative hrefs
  # of <a> tags are resolved against +url+.
  #
  # @param html [String, nil] the document
  # @param url [String, nil] address of the document; without it relative
  #   links cannot be resolved and are left out
  # @return [Array<String>] absolute URLs, duplicates included
  def self.find_url_from_html(html, url = nil)
    html = html.to_s
    base = url.nil? ? nil : normalize_url(url)
    urls = []

    html.scan(URL) do
      found = Regexp.last_match
      candidate = found[0]
      # URL accepts single quotes, so a URL inside a single-quoted attribute
      # would swallow the closing quote; cut it off there.
      start = found.begin(0)
      candidate = candidate[/\A[^']*/] if start > 0 && html[start - 1] == "'"
      urls << candidate unless candidate =~ NON_HTML_URL || candidate =~ GOOGLE_URL
    end

    html.scan(ANCHOR_TAG) do
      href = Regexp.last_match(1).strip
      next if href.empty? || href.start_with?('#')
      # Absolute http links were already collected by the scan above.
      next if href =~ URL || href =~ NON_HTML_URL || href =~ GOOGLE_URL
      next if href =~ /mailto:/i || href =~ /news:/i

      resolved = base && absolute_url(base, href)
      urls << resolved if resolved
    end

    urls
  end

  # Crawls one level of pages and then recurses into the next level.
  #
  # @param urls_in_depth [Array<String>] pages of the current level
  # @param depth [Integer] number of the current level (1 = pages linked from
  #   the start page)
  # @param page_num [Integer] number of pages downloaded so far
  # @param crawled_urls [#include?, #<<] pages already downloaded; every page
  #   downloaded here is added to it so that it is never fetched twice
  # @param max_depth [Integer] last level that is crawled
  # @param max_page [Integer] maximum total number of pages to download
  # @yield [emails, url] called once per downloaded page
  # @yieldparam emails [Array<String>] unique addresses found on the page
  # @yieldparam url [String] address of the page
  # @return [nil]
  def self.find_email_in_depth(urls_in_depth, depth, page_num, crawled_urls, max_depth, max_page, &block)
    return if urls_in_depth.empty?

    urls_in_next_depth = []
    # Iterate over a de-duplicated copy: the list is never changed while it is
    # being walked.
    urls_in_depth.uniq.each do |url|
      return if page_num >= max_page
      next if crawled_urls.include?(url)

      crawled_urls << url
      html = query_by_url(url)
      block.call(find_email_from_html(html).uniq, url) if block
      page_num += 1
      urls_in_next_depth.concat(find_url_from_html(html, url))
    end

    return if depth >= max_depth

    pending = urls_in_next_depth.uniq.reject { |url| crawled_urls.include?(url) }
    find_email_in_depth(pending, depth + 1, page_num, crawled_urls, max_depth, max_page, &block)
  end

  # Crawls a site and prints one "addresses,url" line for every page on which
  # e-mail addresses were found.
  #
  # @param start_url [String] page at which the crawl starts
  # @param max_depth [Integer] how many link levels to follow; 0 reads only
  #   the start page
  # @param max_pages [Integer] maximum number of pages to download, the start
  #   page included
  # @return [Array<EmailSet>] the start page followed by every other page on
  #   which addresses were found
  def self.find_email_online(start_url, max_depth = 0, max_pages = 10)
    crawled_urls = [start_url]
    start_page = query_by_url(start_url)
    result_set = [EmailSet.new(find_email_from_html(start_page).uniq, start_url)]

    if max_depth > 0
      first_level = find_url_from_html(start_page, start_url).uniq
      # The start page is page number one.
      find_email_in_depth(first_level, 1, 1, crawled_urls, max_depth, max_pages) do |emails, url|
        result_set << EmailSet.new(emails, url) unless emails.empty?
      end
    end

    # print the result
    result_set.each do |es|
      puts "#{es.emails.join(",")},#{es.url}" unless es.emails.empty?
    end
  end

  # Strips the URL and appends a slash when it looks like a directory, so
  # that relative links resolve below it.
  def self.normalize_url(url)
    url = url.to_s.strip
    return url if url.empty? || url.end_with?('/')
    return url if url =~ /[?#]/ || url =~ PAGE_URL

    "#{url}/"
  end

  # Resolves +href+ against +base+; returns nil when the result is not a valid
  # http URL.
  def self.absolute_url(base, href)
    resolved = URI.join(base, href.strip).to_s
    resolved =~ /\Ahttp:\/\//i ? resolved : nil
  rescue URI::Error, ArgumentError
    nil
  end

  # Downloads a page and inlines its frames; +frame_depth+ counts how many
  # frames lie between this page and the page originally asked for.
  def self.fetch_page(url, frame_depth)
    return '' if frame_depth > MAX_FRAME_DEPTH

    url = normalize_url(url)
    return '' unless url =~ URL

    content = Net::HTTP.get(URI.parse(url)).to_s
    content.gsub(FRAME_TAG) do
      target = absolute_url(url, Regexp.last_match(1))
      # NOTE(review): google frames are skipped like google links are;
      # the original condition was ambiguous - confirm.
      if target && target !~ GOOGLE_URL
        fetch_page(target, frame_depth + 1)
      else
        ''
      end
    end
  rescue StandardError
    # puts "missing the page of #{url}"
    ''
  end

  private_class_method :normalize_url, :absolute_url, :fetch_page
end

# Demo crawl; only runs when the file is executed directly.
if __FILE__ == $PROGRAM_NAME
  # start_url = 'http://www-cs-students.stanford.edu/~dbyang/'
  start_url = 'http://www-cs.stanford.edu/People'
  [2, 3, 4, 5].each do |max_depth|
    EmailSpider.find_email_online(start_url, max_depth, 1000)
    puts "----------------------------------------"
  end
end