followersやfollowingのユーザー情報を保持するためのクラスです。長く見えますが、各種フラグを設定するためのメソッドが並んでいるだけで、難しいことはしていません。必要な要素はHpricotなどで取得しています。例外処理の書き方にはまだ自信がなく、各処理をクラスのメソッドとして定義すべきかどうかも検討の余地があります。
#!/usr/bin/ruby -Ke require 'rubygems' require 'hpricot' require 'mechanize' require 'kconv' require 'yaml' require 'MeCab' require 'twitter' $KCODE='e' #friendの情報を持っておくためのクラス class Friend attr_accessor :name attr_accessor :source attr_accessor :agent attr_accessor :flag attr_accessor :following attr_accessor :followers attr_accessor :favorites attr_accessor :update attr_accessor :adr attr_accessor :web attr_accessor :bio attr_accessor :entry attr_accessor :time_diff attr_accessor :time_mean attr_accessor :entry_size attr_accessor :blog_info attr_accessor :blog_url attr_accessor :is_program_by_bayes attr_accessor :is_otaku_in_entry_by_bayes attr_accessor :is_otaku_in_bio_by_bayes attr_accessor :at_mark attr_accessor :is_japanese def initialize(twitter) #protectかかってる人の発言はログインしないととれないので、ログインしたagentを渡す @name = twitter[:name] @agent = twitter[:agent] @url = "http://twitter.com/"+@name @flag = twitter[:flag] file_name = "./twitter/"+@name+".html" rewrite = twitter[:rewrite] if rewrite == 1 begin file = File.open(file_name,'w') @source = Hpricot(agent.get_file(@url)) file.puts @source file.close rescue Timeout::Error puts "caught Timeout::Error!" retry # タイムアウトしちゃってもあきらめない! rescue WWW::Mechanize::ResponseCodeError => e case e.response_code when "404" raise "Net::HTTPNotFound!" when "502" puts "Net::HTTPBadGateway!" retry # 上手くアクセスできないときはもう1回! else raise "caught Excepcion!" 
+ e.response_code end rescue => ex raise ex.message end else #Twitterから取得しないでローカルファイルから取得してくる file = File.open(file_name,'r') @source = Hpricot(file.read) file.close end self.set_count self.set_adr self.set_bio self.set_web self.set_entry self.set_time_diff self.set_time_mean self.set_entry_size self.set_blog_info self.set_is_japanese @source = "" rescue => ex raise ex.message end def set_count counter = 0 (@source/"span.stats_count").each{ |item| item = item.inner_text.gsub(/,/,"") #1.000みたいなのの処理 case counter when 0 @following = item when 1 @followers = item when 2 @favorites = item when 3 @update = item else end counter = counter+1 } end def set_adr @adr = "" (@source/"span.adr").each{ |item| @adr = @adr+item.inner_text } end def set_bio @bio = "" (@source/"span.bio").each{ |item| @bio = @bio+item.inner_text } end def set_web @web = "" (@source/"a.url").each{ |item| @web = @web+item.inner_text } end def set_entry @entry = "" (source/"span.entry-content").each{ |item| @entry = @entry+item.inner_text } @entry = @entry.gsub(/\t/,"") @entry = @entry.gsub(/\s/,"") end def set_time_diff @time_diff = 0 time_published = (@source/"abbr.published").map{|item|item.attributes['title']} @time_diff = (Time.parse(time_published[0])-Time.parse(time_published[time_published.length-1])).to_s rescue raise "時間差を計算する要素が存在しません。" end def set_time_mean @time_mean = 0 time = (@source/"abbr.published")[0..(@source/"abbr.published").length-1].map{|item| Time.parse(item.attributes['title'])} aaa =[] aaa[0] = time[0..time.length-1] aaa[1] = time[1..time.length-2] bbb = aaa.map {|x| [x[0]-x[1]]} @time_mean = bbb.flatten.inject(0){|result, item| result + item } / (@source/"abbr.published").length rescue raise "投稿時間の平均を計算するための要素が存在しません。" end def set_blog_info @blog_info = "" @blog_url = "" (source/"ul.about>li>a").each{ |item| @blog_url = item[:href] } agent = WWW::Mechanize.new agent.max_history = 1 begin @blog_info = Hpricot(agent.get_file(@blog_url)).inner_text if @blog_url.length != 0 
#urlが書いてあれば取得してくる rescue Timeout::Error puts "caught Timeout::Error!" retry # タイムアウトしちゃってもあきらめない! rescue WWW::Mechanize::ResponseCodeError => e case e.response_code when "404" raise "Net::HTTPNotFound!" when "502" puts "Net::HTTPBadGateway!" retry # 上手くアクセスできないときはもう1回! else raise "caught Excepcion!" + e.response_code end rescue => ex raise ex.message end end def set_entry_size @entry_size = 0 @entry_size = @entry.length end def set_is_japanese @is_japanese = 0 if @entry.toeuc.gsub(/[^一-龠]/, '').length > 0 @is_japanese = 1 else @is_japanese = 0 end end def write_friends_info file_name = "./twitter/" + self.name + ".txt" otaku = ["オタク","アニメ","anime","ゲーム","漫画","ヲタ","同人","ハルヒ","コスプレ","幼女"] students = ["student","大学生","大学院生","研究","学生","大学","勉強","university","University","数学","物理","バイオ","CS","自然言語"] program = ["プログラマ","プログラム","Ruby","ruby","Perl","perl","Gauche","Program","Java","java","アルゴリズム","JavaScript","javascript","C++","Python","python","ActionScript","Software","Developer","SE","mysql"] begin file = File.open(file_name,'w') ##name file.print "\""+self.name+"\"," ##is_follow file.print "\""+self.flag.to_s+"\"," ##following file.print "\""+self.following+"\"," ##followers file.print "\""+self.followers+"\"," ##favorites file.print "\""+self.favorites+"\"," ##update file.print "\""+self.update+"\"," ##time_diff file.print "\""+self.time_diff.to_s+"\"," ##time_mean file.print "\""+self.time_mean.to_s+"\"," ##entry_size file.print "\""+self.entry_size.to_s+"\"," ##is_japanese file.print "\""+self.is_japanese.to_s+"\"," ##is_tsukuba file.print "\""+has_words(extract_meisi(self.adr.toeuc),["tsukuba","Tsukuba","筑波","つくば"])+"\"," ##is_friend file.print "\""+has_words(extract_meisi(self.entry.toeuc),["T_Hash","yaotti","skylab13","dritoshi","Misho","Muichkine","ma_ko","nakanishi65","blanc_et_noir","yuzuhara","y_benjo","faultier","wakuteka","_a_u","kaeru_san","suu_g","ryo_grid","ryu_higa","ooue50","yuyarin","kis","beatinaniwa","suztomo","hayamiz","anemo"])+"\"," ##is_tokyo 
file.print "\""+has_words(extract_meisi(self.adr.toeuc),["tokyo","Tokyo","東京"])+"\"," ##is_blog_in_web file.print "\""+has_words(extract_meisi(self.web.toeuc),["http"])+"\"," ##is_blog_in_bio file.print "\""+has_words(extract_meisi(self.bio.toeuc),["http"])+"\"," ##is_otaku file.print "\""+has_words(extract_meisi(self.bio.toeuc),otaku)+"\"," ##is_students file.print "\""+has_words(extract_meisi(self.bio.toeuc),students)+"\"," ##is_program file.print "\""+has_words(extract_meisi(self.entry.toeuc),program)+"\"," ##at_mark file.print "\""+extract_meisi(self.entry.toeuc).grep("@").length.to_s+"\"," ##is_program_in_blog file.print "\""+has_words(extract_meisi(self.blog_info.toeuc),program)+"\"," ##is_otaku_in_blog file.print "\""+has_words(extract_meisi(self.blog_info.toeuc),otaku)+"\"," ##is_students_in_blog file.print "\""+has_words(extract_meisi(self.blog_info.toeuc),students)+"\"," ##is_program_by_bayes file.print "\""+self.is_program_by_bayes+"\"," ##is_otaku_in_bio_by_bayes file.print "\""+self.is_otaku_in_bio_by_bayes+"\"," ##is_otaku_in_entry_by_bayes file.print "\""+self.is_otaku_in_entry_by_bayes+"\"" file.puts file.close puts "wrote infomation about "+self.name rescue => ex puts ex.backtrace puts ex.message File.unlink(file_name) puts "deleteed file "+file_name end end end def extract_meisi(str) c = MeCab::Tagger.new("-Ochasen") n = c.parseToNode(str) list = Array.new while n do f = n.feature.split(/,/) if /名詞/ =~ f[0] list.push(n.surface) end n = n.next end return list #list.uniqじゃなくした end def has_words(list,words) if (list & words).length == 0 return 0.to_s else return 1.to_s end end def write_friends_page(name,num,agent) (1..num).each{|n| url = "http://twitter.com/"+name+"/friends?page="+n.to_s file_name = "./twitter/"+name+"-"+n.to_s+".html" if File.exist?(file_name) file = File.open(file_name,'r') source = Hpricot(file.read) file.close else file = File.open(file_name,'w') source = Hpricot(agent.get_file(url)) file.puts source file.close end } end def 
write_followers_page(num,agent) (1..num).each{|n| url = "http://twitter.com/followers?page="+n.to_s file_name = "./twitter/followers-"+n.to_s+".html" if File.exist?(file_name) puts "do nothing" # file = File.open(file_name,'r') # source = Hpricot(file.read) # file.close else file = File.open(file_name,'w') source = agent.get_file(url) puts url file.puts source file.close end } end def get_friends(name,num,agent=agent) friends = [] (1..num).each{|n| file_name = "./twitter/"+name+"-"+n.to_s+".html" file = File.open(file_name,'r') source = Hpricot(file.read) file.close (source/"a.url").each{ |item| begin friends.push item.inner_text rescue => ex print ex.message,"\n" end puts item.inner_text+" was added in the friends list." } } return friends end def get_followers(num,agent=agent) friends = [] puts "start" (1..num).each{|n| file_name = "./twitter/followers-"+n.to_s+".html" puts file_name file = File.open(file_name,'r') source = Hpricot(file.read) file.close (source/"a.url").each{ |item| begin friends.push item.inner_text rescue => ex print ex.message,"\n" end puts item.inner_text+" was added in the followers list." } } return friends end def print_keyword_index(friends,threshold=5) #キーワードの取得などをする puts "Started calc about how many appeared in the keyword list." keyword = Array.new friends.each{|friend| begin keyword = keyword+extract_meisi(friend.bio.toeuc) sleep 1 rescue => ex print ex.message,"\n" end } count = Hash.new(0) keyword.each{|word| count[word] +=1 } count.sort{|a,b| b[1]<=>a[1] # 降順 }.each{|key,value| if value > threshold print "#{key}: #{value}\n" end } puts "Ended calc about how many appeared in the keyword list." 
end def is_program_by_bayes(bayes,sample) result = bayes.classify(sample) if result.to_s == 'Program' return "1" else return "0" end end def is_otaku_by_bayes(bayes,sample) result = bayes.classify(sample) if result.to_s == 'Otaku' return "1" else return "0" end end def get_friends_for_prediction(list) agent = WWW::Mechanize.new list.each{|name| url = "http://twitter.com/" + name file_name = "./twitter/"+name+".html" begin file = File.open(file_name,'w') source = agent.get_file(url) file.puts source file.close rescue Timeout::Error puts "caught Timeout::Error!" retry # タイムアウトしちゃってもあきらめない! rescue WWW::Mechanize::ResponseCodeError => e case e.response_code when "404" raise "Net::HTTPNotFound!" when "502" puts "Net::HTTPBadGateway!" retry # 上手くアクセスできないときはもう1回! else raise "caught Excepcion!" + e.response_code end rescue => ex puts ex.message retry # 上手くアクセスできないときはもう1回! end } end