読者です 読者をやめる 読者になる 読者になる

friend.rb

Ruby 1000speakers

followersやfollowingの人の情報を持っておくためのクラス。何か長いようですが、フラグを付けるためのメソッドがいろいろあるだけで難しいことはやってません。Hpricotなどで必要な要素を取ってきています。例外処理はよく分かっていません。あと、クラスのメソッドとして定義するべきかどうかの辺りもよく分かっていません。

#!/usr/bin/ruby -Ke

require 'rubygems'
require 'hpricot'
require 'mechanize'
require 'kconv'
require 'yaml'
require 'MeCab'
require 'twitter'

$KCODE='e'

#friendの情報を持っておくためのクラス
class Friend
  # Scraped profile data for one Twitter user.
  # The user's page HTML is downloaded (or read from a local cache under
  # ./twitter/) and picked apart with Hpricot CSS selectors; each set_*
  # method below fills in one group of attributes.
  attr_accessor :name
  attr_accessor :source
  attr_accessor :agent
  attr_accessor :flag
  attr_accessor :following
  attr_accessor :followers
  attr_accessor :favorites
  attr_accessor :update
  attr_accessor :adr
  attr_accessor :web
  attr_accessor :bio
  attr_accessor :entry
  attr_accessor :time_diff
  attr_accessor :time_mean
  attr_accessor :entry_size
  attr_accessor :blog_info
  attr_accessor :blog_url
  attr_accessor :is_program_by_bayes
  attr_accessor :is_otaku_in_entry_by_bayes
  attr_accessor :is_otaku_in_bio_by_bayes
  attr_accessor :at_mark
  attr_accessor :is_japanese
  # twitter is an options hash:
  #   :name    - Twitter screen name
  #   :agent   - logged-in WWW::Mechanize agent
  #   :flag    - caller-supplied follow flag, stored as-is
  #   :rewrite - 1 to (re)download the page; anything else reuses the
  #              cached ./twitter/<name>.html file
  def initialize(twitter)
    # statuses of protected users cannot be fetched without logging in,
    # so a logged-in agent is passed in
    @name = twitter[:name]

    @agent = twitter[:agent]
    @url = "http://twitter.com/"+@name
    @flag = twitter[:flag]
    file_name = "./twitter/"+@name+".html"
    rewrite = twitter[:rewrite]

    if rewrite == 1
      # NOTE(review): the cache file is opened for writing BEFORE the
      # download, so a failed fetch leaves a truncated cache file.
      begin
        file = File.open(file_name,'w')
        @source = Hpricot(agent.get_file(@url))
        file.puts @source
        file.close
      rescue Timeout::Error
        puts "caught Timeout::Error!"
        # don't give up on a timeout!
        # NOTE(review): this retry (and the 502 one below) is unbounded.
        retry
      rescue WWW::Mechanize::ResponseCodeError => e
        case e.response_code
        when "404"
          raise "Net::HTTPNotFound!"
        when "502"
          puts "Net::HTTPBadGateway!"
          retry # transient bad gateway: try once more
        else
          raise "caught Excepcion!" + e.response_code
        end
      rescue => ex
        raise ex.message
      end
    else
      # read from the local cache instead of hitting Twitter
      file = File.open(file_name,'r')
      @source = Hpricot(file.read)
      file.close
    end      

    # Extract every derived attribute from the parsed page.
    self.set_count
    self.set_adr
    self.set_bio
    self.set_web
    self.set_entry
    self.set_time_diff
    self.set_time_mean
    self.set_entry_size
    self.set_blog_info
    self.set_is_japanese
    # presumably to free the parsed HTML once the fields are extracted
    @source = ""
  rescue => ex
    raise ex.message
  end
  
  # Fills @following/@followers/@favorites/@update from the page's
  # span.stats_count elements.
  # NOTE(review): relies on those spans appearing in exactly this order
  # on the page — confirm against the page template.
  def set_count
    counter = 0
    (@source/"span.stats_count").each{ |item|
      item = item.inner_text.gsub(/,/,"")
      # strip thousands separators such as "1,000"
      case counter
      when 0
        @following = item
      when 1
        @followers = item
      when 2
        @favorites = item
      when 3
        @update = item
      else
      end
      counter = counter+1
    }
  end

  # Concatenates the text of all span.adr elements (location field).
  def set_adr
    @adr = ""
    (@source/"span.adr").each{ |item|
      @adr = @adr+item.inner_text
    }
  end
  
  # Concatenates the text of all span.bio elements (profile bio).
  def set_bio
    @bio = ""
    (@source/"span.bio").each{ |item|
      @bio = @bio+item.inner_text
    }
  end

  # Concatenates the text of all a.url anchors (website field).
  def set_web
    @web = ""
    (@source/"a.url").each{ |item|
      @web = @web+item.inner_text
    }
  end
  
  # Concatenates all visible status texts, then strips tabs and
  # whitespace so the result is one continuous string.
  def set_entry
    @entry = ""
    (source/"span.entry-content").each{ |item|
      @entry = @entry+item.inner_text
    }
    @entry = @entry.gsub(/\t/,"")
    @entry = @entry.gsub(/\s/,"")
  end

  # Seconds between the newest and oldest status shown on the page
  # (taken from abbr.published title attributes), stored as a String.
  def set_time_diff
    @time_diff = 0
    
    time_published = (@source/"abbr.published").map{|item|item.attributes['title']}
    @time_diff = (Time.parse(time_published[0])-Time.parse(time_published[time_published.length-1])).to_s
  rescue
    raise "時間差を計算する要素が存在しません。"
  end
  
  # Intended to be the mean interval between posts.
  # NOTE(review): as written, bbb only contains time[0]-time[1] and
  # time[1]-time[2], so the sum telescopes to time[0]-time[2] divided by
  # the number of timestamps — almost certainly not the intended mean of
  # all consecutive differences (each_cons(2) would give that). Left
  # as-is because downstream data may depend on this value.
  def set_time_mean
    @time_mean = 0
    time = (@source/"abbr.published")[0..(@source/"abbr.published").length-1].map{|item| Time.parse(item.attributes['title'])}
    aaa =[]
    aaa[0] = time[0..time.length-1]
    aaa[1] = time[1..time.length-2]
    
    bbb = aaa.map {|x| [x[0]-x[1]]}
    @time_mean = bbb.flatten.inject(0){|result, item| result + item } / (@source/"abbr.published").length
  rescue
    raise "投稿時間の平均を計算するための要素が存在しません。"
  end
  
  # Takes the last href found in the profile's "ul.about>li>a" links as
  # the user's blog URL and, if present, downloads that page's text.
  def set_blog_info
    @blog_info = ""
    @blog_url = ""

    (source/"ul.about>li>a").each{ |item|
      @blog_url = item[:href]
    }
    agent = WWW::Mechanize.new
    agent.max_history = 1
    begin    
    @blog_info = Hpricot(agent.get_file(@blog_url)).inner_text if @blog_url.length != 0
    # only fetch when a URL was actually found
    rescue Timeout::Error
      puts "caught Timeout::Error!"
      retry # don't give up on a timeout! (NOTE(review): unbounded)
    rescue WWW::Mechanize::ResponseCodeError => e
      case e.response_code
      when "404"
        raise "Net::HTTPNotFound!"
      when "502"
        puts "Net::HTTPBadGateway!"
        retry # transient bad gateway: try once more
      else
        raise "caught Excepcion!" + e.response_code
      end
    rescue => ex
      raise ex.message
    end
  end
  
  # Character count of the concatenated statuses.
  def set_entry_size
    @entry_size = 0
    @entry_size = @entry.length
  end

  # 1 if the statuses contain at least one kanji (EUC-JP range 一-龠
  # after toeuc, matching $KCODE='e'), else 0.
  def set_is_japanese
    @is_japanese = 0
    if @entry.toeuc.gsub(/[^一-龠]/, '').length > 0
      @is_japanese = 1
    else
      @is_japanese = 0
    end
  end

  # Writes one quoted, comma-separated feature row to
  # ./twitter/<name>.txt; if anything fails mid-write the partial file
  # is deleted. Relies on the top-level extract_meisi/has_words helpers.
  def write_friends_info
    file_name = "./twitter/" + self.name + ".txt"

    # keyword lists used as binary features against the extracted nouns
    otaku = ["オタク","アニメ","anime","ゲーム","漫画","ヲタ","同人","ハルヒ","コスプレ","幼女"]
    students = ["student","大学生","大学院生","研究","学生","大学","勉強","university","University","数学","物理","バイオ","CS","自然言語"]
    program = ["プログラマ","プログラム","Ruby","ruby","Perl","perl","Gauche","Program","Java","java","アルゴリズム","JavaScript","javascript","C++","Python","python","ActionScript","Software","Developer","SE","mysql"]
    begin
      file = File.open(file_name,'w')
      ##name
      file.print "\""+self.name+"\","
      ##is_follow
      file.print "\""+self.flag.to_s+"\","
      ##following
      file.print "\""+self.following+"\","
      ##followers
      file.print "\""+self.followers+"\","
      ##favorites
      file.print "\""+self.favorites+"\","
      ##update
      file.print "\""+self.update+"\","
      ##time_diff
      file.print "\""+self.time_diff.to_s+"\","
      ##time_mean
      file.print "\""+self.time_mean.to_s+"\","
      ##entry_size
      file.print "\""+self.entry_size.to_s+"\","
      ##is_japanese
      file.print "\""+self.is_japanese.to_s+"\","
      ##is_tsukuba
      file.print "\""+has_words(extract_meisi(self.adr.toeuc),["tsukuba","Tsukuba","筑波","つくば"])+"\","
      ##is_friend
      file.print "\""+has_words(extract_meisi(self.entry.toeuc),["T_Hash","yaotti","skylab13","dritoshi","Misho","Muichkine","ma_ko","nakanishi65","blanc_et_noir","yuzuhara","y_benjo","faultier","wakuteka","_a_u","kaeru_san","suu_g","ryo_grid","ryu_higa","ooue50","yuyarin","kis","beatinaniwa","suztomo","hayamiz","anemo"])+"\","
      ##is_tokyo
      file.print "\""+has_words(extract_meisi(self.adr.toeuc),["tokyo","Tokyo","東京"])+"\","
      ##is_blog_in_web
      file.print "\""+has_words(extract_meisi(self.web.toeuc),["http"])+"\","
      ##is_blog_in_bio
      file.print "\""+has_words(extract_meisi(self.bio.toeuc),["http"])+"\","
      ##is_otaku
      file.print "\""+has_words(extract_meisi(self.bio.toeuc),otaku)+"\","
      ##is_students
      file.print "\""+has_words(extract_meisi(self.bio.toeuc),students)+"\","
      ##is_program
      file.print "\""+has_words(extract_meisi(self.entry.toeuc),program)+"\","
      ##at_mark
      file.print "\""+extract_meisi(self.entry.toeuc).grep("@").length.to_s+"\","
      ##is_program_in_blog
      file.print "\""+has_words(extract_meisi(self.blog_info.toeuc),program)+"\","
      ##is_otaku_in_blog
      file.print "\""+has_words(extract_meisi(self.blog_info.toeuc),otaku)+"\","
      ##is_students_in_blog
      file.print "\""+has_words(extract_meisi(self.blog_info.toeuc),students)+"\","
      ##is_program_by_bayes
      file.print "\""+self.is_program_by_bayes+"\","
      ##is_otaku_in_bio_by_bayes
      file.print "\""+self.is_otaku_in_bio_by_bayes+"\","
      ##is_otaku_in_entry_by_bayes
      file.print "\""+self.is_otaku_in_entry_by_bayes+"\""
      file.puts
      file.close
      # (typos "infomation"/"deleteed" below are in runtime strings and
      # are left untouched in this documentation-only pass)
      puts "wrote infomation about "+self.name
    rescue => ex
      puts ex.backtrace
      puts ex.message
      File.unlink(file_name)
      puts "deleteed file "+file_name
    end
  end
end  

# Runs MeCab over +str+ and returns the surface form of every noun
# (part-of-speech matching /名詞/), in document order.
# Duplicates are intentionally kept (no uniq).
def extract_meisi(str)
  tagger = MeCab::Tagger.new("-Ochasen")
  nouns = []
  node = tagger.parseToNode(str)
  until node.nil?
    pos = node.feature.split(/,/)[0]
    nouns << node.surface if /名詞/ =~ pos
    node = node.next
  end
  nouns
end

# Returns "1" if +list+ and +words+ share at least one element,
# otherwise "0" (string values are what the CSV writer expects).
def has_words(list,words)
  ((list & words).empty? ? 0 : 1).to_s
end

# Caches the HTML of +name+'s friends pages 1..num under ./twitter/.
# Existing cache files are re-read (and parsed) but never overwritten;
# missing ones are fetched with +agent+ and written to disk.
def write_friends_page(name,num,agent)
  1.upto(num) do |n|
    page_url = "http://twitter.com/#{name}/friends?page=#{n}"
    cache    = "./twitter/#{name}-#{n}.html"
    if File.exist?(cache)
      f = File.open(cache,'r')
      Hpricot(f.read)
      f.close
    else
      f = File.open(cache,'w')
      f.puts Hpricot(agent.get_file(page_url))
      f.close
    end
  end
end

# Caches the logged-in user's followers pages 1..num under ./twitter/.
# An existing cache file is skipped (prints "do nothing"); otherwise the
# raw page body is fetched with +agent+ and written to disk.
def write_followers_page(num,agent)
  1.upto(num) do |n|
    page_url = "http://twitter.com/followers?page=#{n}"
    cache    = "./twitter/followers-#{n}.html"
    if File.exist?(cache)
      puts "do nothing"
    else
      f = File.open(cache,'w')
      body = agent.get_file(page_url)
      puts page_url
      f.puts body
      f.close
    end
  end
end

# Reads the cached friends pages ./twitter/<name>-<n>.html (n = 1..num)
# and returns an Array of the screen names found in "a.url" anchors.
#
# name  - screen name whose cached pages are scanned
# num   - number of cached pages to read
# agent - unused here; kept only for call-site compatibility. The
#         original default `agent=agent` is self-referential and always
#         evaluates to nil, so it is preserved verbatim.
def get_friends(name,num,agent=agent)
  friends = []
  1.upto(num) do |n|
    cache = "./twitter/#{name}-#{n}.html"
    handle = File.open(cache,'r')
    doc = Hpricot(handle.read)
    handle.close
    (doc/"a.url").each do |anchor|
      begin
        friends.push anchor.inner_text
      rescue => ex
        print ex.message,"\n"
      end
      puts anchor.inner_text+" was added in the friends list."
    end
  end
  friends
end

# Reads the cached followers pages ./twitter/followers-<n>.html
# (n = 1..num) and returns an Array of the screen names found in
# "a.url" anchors.
#
# num   - number of cached pages to read
# agent - unused here; kept only for call-site compatibility. The
#         original default `agent=agent` is self-referential and always
#         evaluates to nil, so it is preserved verbatim.
def get_followers(num,agent=agent)
  friends = []
  puts "start"
  1.upto(num) do |n|
    cache = "./twitter/followers-#{n}.html"
    puts cache
    handle = File.open(cache,'r')
    doc = Hpricot(handle.read)
    handle.close
    (doc/"a.url").each do |anchor|
      begin
        friends.push anchor.inner_text
      rescue => ex
        print ex.message,"\n"
      end
      puts anchor.inner_text+" was added in the followers list."
    end
  end
  friends
end

# Tallies the nouns appearing in each friend's bio and prints every
# keyword whose count exceeds +threshold+, most frequent first.
# Sleeps one second per friend, matching the original pacing.
def print_keyword_index(friends,threshold=5)
  puts "Started calc about how many appeared in the keyword list."
  keywords = []
  friends.each do |friend|
    begin
      keywords.concat(extract_meisi(friend.bio.toeuc))
      sleep 1
    rescue => ex
      print ex.message,"\n"
    end
  end

  tally = Hash.new(0)
  keywords.each { |word| tally[word] += 1 }

  # descending by count
  tally.sort_by { |_word, count| -count }.each do |key,value|
    print "#{key}: #{value}\n" if value > threshold
  end
  puts "Ended calc about how many appeared in the keyword list."
end

# Classifies +sample+ with the given +bayes+ classifier and returns the
# string "1" when the predicted label is 'Program', "0" otherwise.
def is_program_by_bayes(bayes,sample)
  bayes.classify(sample).to_s == 'Program' ? "1" : "0"
end

# Classifies +sample+ with the given +bayes+ classifier and returns the
# string "1" when the predicted label is 'Otaku', "0" otherwise.
def is_otaku_by_bayes(bayes,sample)
  bayes.classify(sample).to_s == 'Otaku' ? "1" : "0"
end

# Downloads http://twitter.com/<name> for each name in +list+ and caches
# the raw HTML as ./twitter/<name>.html.
#
# Fixes over the original:
# * the generic rescue used an unconditional `retry`, looping forever on
#   persistent failures (e.g. a permission error) — now bounded.
# * the cache file was opened for writing before the download, leaking
#   the handle and leaving a truncated file when the fetch raised — now
#   we fetch first and use the block form of File.open, which always
#   closes the handle.
#
# Raises on 404 and on unexpected HTTP response codes, matching the
# original error strings.
def get_friends_for_prediction(list)
  agent = WWW::Mechanize.new
  list.each{|name|
    url = "http://twitter.com/" + name
    file_name = "./twitter/"+name+".html"
    attempts = 0
    begin
      source = agent.get_file(url)
      File.open(file_name,'w') { |file| file.puts source }
    rescue Timeout::Error
      puts "caught Timeout::Error!"
      retry # don't give up on a timeout
    rescue WWW::Mechanize::ResponseCodeError => e
      case e.response_code
      when "404"
        raise "Net::HTTPNotFound!"
      when "502"
        puts "Net::HTTPBadGateway!"
        retry # transient bad gateway: try again
      else
        raise "caught Excepcion!" + e.response_code
      end
    rescue => ex
      puts ex.message
      attempts += 1
      retry if attempts < 3 # bounded retry instead of an infinite loop
      raise
    end
  }
end