用 Ruby 写的一个网络爬虫程序

Apr 22 2009

前几天写的一个 Ruby 爬虫，专门抓取指定网站上的 Email 地址。

require 'net/http'
# A small breadth-first crawler that collects e-mail addresses from the pages
# reachable from a start URL. Only plain "http://" URLs are followed.
module EmailSpider
  # Matches an e-mail address such as "first.last+tag@mail.example.com".
  EMAIL=/\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*/i
  # Matches an absolute http URL. The fullest URI looks like this:
  # http://www.sina.com.cn:80/abc?query='123'&page='http://eee.sss.vv:8080'
  URL=/http:\/\/\w+(\.\w+)*([:]?\d*)?[\/]?[0-9a-zA-Z\/=\?%#\-\&_~\`@\[\]\':\+!\.]*/i
  # Matches links that are not worth fetching: static assets (optionally
  # followed by a query/fragment) and javascript-style calls like "go();".
  # The whole alternation is anchored at the end so ".js" no longer
  # swallows ".jsp" pages.
  NON_HTML_URL=/(?:\.(?:dtd|css|js|pdf|png|jpg|ps)(?:[?#].*)?|\(.*\)[;]?[ ]*)\z/i
  # Matches any Google host; such links are never followed.
  GOOGLE_URL=/http:\/\/\w+\.google\./i
  # Matches the scheme + host (+ optional port and trailing slash) of a URL.
  # Host labels may contain hyphens (e.g. "www-cs.stanford.edu").
  DOMAIN=/http:\/\/[\w-]+(\.[\w-]+)*([:]?\d*)?[\/]?/i
  # URLs ending in one of these extensions are treated as pages, not directories.
  PAGE_URL=/\.(?:s?html?|jsp|aspx?|php\d?)\z/i
  # Captures the src attribute of a <frame>/<iframe> tag (either quote style).
  FRAME_SRC=/<i?frame\s(?:[^>]*?\s)?src\s*=\s*["']([^"'<>]+)["'][^>]*>/i
  # Captures the href attribute of an <a> tag (either quote style).
  ANCHOR_HREF=/<a\s(?:[^>]*?\s)?href\s*=\s*["']([^"'<>]+)["']/i
  # Matches a leading URI scheme such as "mailto:", "news:" or "javascript:".
  SCHEME_PREFIX=/\A[a-z][a-z0-9+.\-]*:/i
  # Upper bound on nested frame inlining, so self-referencing frames terminate.
  MAX_FRAME_DEPTH=5

  # Value object pairing the e-mail addresses found with the page they came from.
  class EmailSet
    attr_accessor :emails,:url
    def initialize(emails,url)
      @emails,@url=emails,url
    end
  end

  # Fetches the page at +url+ and returns its body with the content of any
  # frames inlined in place of the frame tags.
  #
  # url         - String URL; the caller's string is never modified.
  # frame_depth - Integer nesting level of frame inlining (internal use).
  #
  # Always returns a String: "" when +url+ is not an http URL or when the
  # request fails for any reason (the crawl is deliberately best-effort).
  def self.query_by_url(url, frame_depth=0)
    target = directory_url(url)
    return "" unless target =~ URL

    content = Net::HTTP.get(URI.parse(target)).to_s
    return content if frame_depth >= MAX_FRAME_DEPTH

    content.gsub(FRAME_SRC) do
      frame_url = resolve_url(target, Regexp.last_match(1))
      # NOTE(review): the original tested the *page* URL against GOOGLE_URL;
      # skipping Google-hosted frames looks like the intent -- confirm.
      if frame_url.nil? || frame_url =~ GOOGLE_URL
        ""
      else
        query_by_url(frame_url, frame_depth + 1)
      end
    end
  rescue StandardError
    # A missing or unreachable page simply contributes no content.
    ""
  end

  # Returns every e-mail address found in +html+ (duplicates included, in
  # document order). A nil +html+ yields an empty Array.
  def self.find_email_from_html(html)
    emails = []
    # EMAIL contains capture groups, so take the whole match explicitly.
    html.to_s.scan(EMAIL) { emails << Regexp.last_match(0) }
    emails
  end

  # Collects the crawlable links found in +html+.
  #
  # html - String page content (nil is treated as empty).
  # url  - String URL of the page, used to resolve relative links. When nil,
  #        only absolute URLs are returned.
  #
  # Returns an Array of absolute URL Strings (may contain duplicates):
  # first every absolute http URL appearing anywhere in the text, then every
  # relative <a href> resolved against +url+. Assets, Google links and
  # non-http schemes are skipped.
  def self.find_url_from_html(html,url=nil)
    html = html.to_s
    base = directory_url(url)
    urls = []

    html.scan(URL) do
      found = Regexp.last_match(0)
      urls << found unless found =~ NON_HTML_URL || found =~ GOOGLE_URL
    end
    return urls if base.empty?

    html.scan(ANCHOR_HREF) do |groups|
      href = groups[0].strip
      # Absolute http links were already collected by the scan above.
      next if href.empty? || href =~ /\A#/ || href =~ URL
      next if href =~ SCHEME_PREFIX || href =~ NON_HTML_URL

      resolved = resolve_url(base, href)
      next if resolved.nil? || resolved !~ URL || resolved =~ GOOGLE_URL
      urls << resolved
    end
    urls
  end

  # Crawls one breadth-first level and recurses into the next one.
  #
  # urls_in_depth - Array of URL Strings forming the current level (not modified).
  # depth         - Integer number of the current level (1 = links of the start page).
  # page_num      - Integer count of pages fetched so far.
  # crawled_urls  - Array of URLs already fetched; every page fetched here is
  #                 appended so deeper levels do not fetch it again.
  # max_depth     - Integer deepest level to crawl.
  # max_page      - Integer maximum total number of pages to fetch.
  # block         - called as block.call(unique_emails, url) for each fetched page.
  #
  # Returns nil.
  def self.find_email_in_depth(urls_in_depth,depth,page_num,crawled_urls,max_depth,max_page,&block)
    return if urls_in_depth.nil? || urls_in_depth.empty?

    urls_in_next_depth = []
    urls_in_depth.uniq.each do |url|
      return if page_num >= max_page

      # Compare on the normalized form so "x" and "x/" count as one page.
      key = directory_url(url)
      next if crawled_urls.include?(url) || crawled_urls.include?(key)

      crawled_urls << key
      html = query_by_url(url)
      block.call(find_email_from_html(html).uniq, url) if block
      page_num += 1
      urls_in_next_depth.concat(find_url_from_html(html, url))
    end

    return if depth + 1 > max_depth
    find_email_in_depth(urls_in_next_depth.uniq, depth + 1, page_num, crawled_urls, max_depth, max_page, &block)
  end

  # Crawls from +start_url+ and prints one "email1,email2,...,url" line per
  # page on which at least one address was found.
  #
  # start_url - String URL of the first page.
  # max_depth - Integer number of link levels to follow (0 = start page only).
  # max_pages - Integer maximum number of pages to fetch, start page included.
  #
  # Returns the Array of EmailSet results (the start page's set is always first).
  def self.find_email_online(start_url,max_depth=0,max_pages=10)
    result_set = []
    crawled_urls = [start_url, directory_url(start_url)].uniq
    start_page = query_by_url(start_url)
    result_set << EmailSet.new(find_email_from_html(start_page).uniq, start_url)

    if max_depth > 0
      first_level = find_url_from_html(start_page, start_url).uniq
      # One page (the start page) has been fetched so far.
      find_email_in_depth(first_level, 1, 1, crawled_urls, max_depth, max_pages) do |emails, url|
        result_set << EmailSet.new(emails, url) unless emails.empty?
      end
    end

    # print the result
    result_set.each do |es|
      puts "#{es.emails.join(",")},#{es.url}" unless es.emails.empty?
    end
  end

  # Returns a stripped copy of +url+ with a trailing "/" appended when it looks
  # like a directory (no trailing slash, no query/fragment, no page extension).
  def self.directory_url(url)
    url = url.to_s.strip
    return url if url.empty? || url =~ /\/\z/ || url =~ /[?#]/ || url =~ PAGE_URL
    "#{url}/"
  end

  # Resolves +link+ against +base+ using URI.join and drops any fragment.
  # Returns the absolute URL String, or nil when either part is not a valid URI.
  def self.resolve_url(base, link)
    return nil if base.to_s.empty?
    uri = URI.join(base, link.to_s.strip)
    uri.fragment = nil
    uri.to_s
  rescue StandardError
    nil
  end

  private_class_method :directory_url, :resolve_url
end
begin
  # Driver: crawl the same start page at increasing link depths (2 through 5),
  # allowing up to 1000 pages per run, and print a separator after each run.
  #start_url='http://www-cs-students.stanford.edu/~dbyang/'
  start_url = 'http://www-cs.stanford.edu/People'
  [2, 3, 4, 5].each do |crawl_depth|
    EmailSpider.find_email_online(start_url, crawl_depth, 1000)
    puts "----------------------------------------"
  end
end
Tags:

No responses yet

Leave a Reply