本当は問題発見と解決くらいの時に、これができてればよかったんだけどねwww。というわけで初めてsqliteとか使ってみました。DBの設計とかよく分からないというか、忘れてしまったので適当。マルチスレッドでやろうとしたんだけど、Singletonパターンでできているオブジェクトの共有とか、DBの書き込みの競合とかが起こってわけわかめになったため、まったりと待つバージョンにしました。
main.rb
# This is the script to run.
#
# Walks every day in the configured range and, for each day, every page of the
# Hatena counter access log, storing each page's rows through
# Counters#register. Paging for a day stops at the first page with no log rows.
require 'hatena_counter'

counters = Counters.new("counter.db")
counter = counters.get_counter

first_day = {"year"=>2007,"month"=>1,"day"=>1}
last_day = {"year"=>2008,"month"=>5,"day"=>30}

make_date_sequence(first_day, last_day).each do |day|
  page = 1
  loop do
    # Build the URL for the page we are about to check *before* fetching it,
    # so the existence check and the registration always refer to the same page.
    url = "http://counter.hatena.ne.jp/syou6162/log?cid=1&type=daily&date="+day.to_s+"&page="+page.to_s
    counter.set_url(url)
    counter.get_logs
    break unless counter.log_exist?

    counters.register(day.to_s, page)
    puts url
    page += 1
    sleep 3 # be gentle with the remote server between pages
  end
end
hatena_counter.rb
# Author's note: a few method names say "get" where "set" would be more
# accurate (and the other way round); they are left as-is for now.
# -*- coding: utf-8 -*-
require 'rubygems'
require 'hpricot'
require 'mechanize'
require 'sqlite3'
require 'yaml'
require 'singleton'
require 'kconv'
require 'date'

# Builds the list of consecutive dates from +start+ to +finish+, inclusive.
#
# start  - Hash with integer "year", "month" and "day" entries.
# finish - Hash with the same keys.
#
# Returns an Array of Date objects; empty when +finish+ precedes +start+.
def make_date_sequence(start, finish)
  first = Date.new(start['year'], start['month'], start['day'])
  last = Date.new(finish['year'], finish['month'], finish['day'])
  (first..last).to_a
end

# Logged-in scraper for the Hatena counter log pages.
#
# A single shared instance (Singleton) holds the Mechanize agent, the most
# recently fetched document (+source+) and the rows parsed from it (+outer+).
class Counter
  include Singleton

  attr_accessor :agent
  attr_accessor :source
  attr_accessor :outer

  # Logs in to Hatena using the "name" and "password" entries of hatena.yaml
  # in the current working directory.
  def initialize
    @outer = [] # lets log_exist? be called safely before the first get_logs
    @agent = WWW::Mechanize.new
    @agent.max_history = 1
    config = YAML.load_file("hatena.yaml")
    page = @agent.get("https://www.hatena.ne.jp/login")
    login_form = page.forms[0]
    login_form['name'] = config['name']
    login_form['password'] = config['password']
    @agent.submit(login_form)
  end

  # Fetches +url+ and stores the parsed document in +source+.
  #
  # Timeouts and HTTP 502 responses are retried without any limit (this is
  # deliberate: the crawl is meant to wait the server out). Every other
  # failure is re-raised as a RuntimeError carrying only a message.
  def set_url(url)
    begin
      @source = Hpricot(@agent.get_file(url))
    rescue Timeout::Error
      puts "caught Timeout::Error!"
      retry # never give up on a timeout
    rescue WWW::Mechanize::ResponseCodeError => e
      case e.response_code
      when "404"
        raise "Net::HTTPNotFound!"
      when "502"
        puts "Net::HTTPBadGateway!"
        retry # try once more when the server could not be reached properly
      else
        raise "caught Excepcion!" + e.response_code
      end
    rescue => ex
      raise ex.message
    end
  end

  # Parses the rows of table#log_table in the current +source+.
  #
  # The first row is dropped (NOTE(review): presumably the header row -
  # confirm against the page markup). The result is also stored in +outer+.
  #
  # Returns an Array of rows, each an Array with the text of its <td> cells;
  # empty when the table has no rows at all.
  def get_logs
    rows = (@source/"table#log_table>tr").map do |row|
      (row/:td).map { |cell| cell.inner_text }
    end
    # rows[1..-1] is nil when no row matched at all; fall back to an empty
    # list so log_exist? keeps working.
    @outer = rows[1..-1] || []
    return @outer
  end

  # Returns true when the last get_logs call produced at least one row.
  def log_exist?
    !@outer.empty?
  end
end

# Persists scraped log rows into the "counter" table of a SQLite database file.
class Counters
  # NOTE(review): "date" is the primary key with "on conflict ignore", so a row
  # whose date value is already stored is silently skipped - confirm that this
  # is the intended de-duplication rule.
  CREATE_TABLE_SQL = <<EOF
create table if not exists counter (
  date text primary key on conflict ignore,
  url text,
  browser text,
  language text,
  screen text,
  access text,
  link text
);
EOF

  INSERT_SQL = "insert into counter (date,url,browser,language,screen,access,link) values(?,?,?,?,?,?,?)"

  COLUMN_COUNT = 7

  attr_accessor :db

  # db - path of the SQLite database file. The counter table is created when
  #      it does not exist yet.
  def initialize(db="counter.db")
    @db = db
    with_database { |conn| conn.execute(CREATE_TABLE_SQL) }
  end

  # Fetches one log page and inserts its rows inside a single transaction.
  #
  # day  - date String placed in the "date" query parameter.
  # page - page number placed in the "page" query parameter.
  #
  # A row that fails with SQLite3::SQLException is reported on stdout and
  # skipped; the remaining rows are still inserted.
  def register(day,page)
    url = "http://counter.hatena.ne.jp/syou6162/log?cid=1&type=daily&date="+day+"&page="+page.to_s
    counter = Counter.instance
    counter.set_url(url)
    logs = counter.get_logs

    with_database do |conn|
      conn.busy_timeout(5000)
      conn.transaction do |t|
        logs.each do |log|
          begin
            # Bind parameters keep values containing quotes from breaking the
            # statement. Missing cells are stored as empty strings.
            values = Array.new(COLUMN_COUNT) { |i| log[i].to_s.toutf8 }
            t.execute(INSERT_SQL, values)
          rescue SQLite3::SQLException => ex
            puts ex.message
          end
        end
      end
    end
  end

  # Returns the shared Counter instance.
  def get_counter
    return Counter.instance
  end

  private

  # Opens the database file, yields the connection and always closes it.
  def with_database
    conn = SQLite3::Database.new(@db)
    begin
      yield conn
    ensure
      conn.close
    end
  end
end
test_hatena_counter.rb
テストとかほとんど書いたことがないゆとりなんですが、なんとなく書いてみた。実行するとこんな感じになるのかー。/Users/yasuhisa% ruby test_hatena_counter.rb Loaded suite test_hatena_counter Started ......... Finished in 5.92334 seconds. 9 tests, 418 assertions, 0 failures, 0 errors
で、テストするためのスクリプト。書いた後に「test_1」とかではなくて、本体のメソッドに対応した名前のついたtest用のメソッドが必要であることに気がついたりとかした。
require 'test/unit'
require 'hatena_counter'

# Tests for Counter and make_date_sequence.
#
# The Counter tests talk to the live Hatena counter service, and the expected
# row counts are those of the author's own access log for the given dates.
class TestHatenaCounter < Test::Unit::TestCase
  def setup
    @counter = Counter.instance
  end

  # Every parsed log row must have 7 cells.
  def test_get_logs_rows_have_seven_columns
    url = "http://counter.hatena.ne.jp/syou6162/log?cid=1&type=daily&date=2008-03-01&page=1&column=5&column=6"
    @counter.set_url(url)
    assert_seven_columns(@counter.get_logs)
  end

  # Without a page parameter the first page holds 50 rows.
  def test_first_page_has_fifty_rows
    url = "http://counter.hatena.ne.jp/syou6162/log?cid=1&type=daily&date=2008-03-01"
    @counter.set_url(url)
    logs = @counter.get_logs
    assert_equal(50, logs.length)
    assert(@counter.log_exist?)
    assert_seven_columns(logs)
  end

  # A page past the end of the log yields no rows.
  def test_page_past_the_end_is_empty
    url = "http://counter.hatena.ne.jp/syou6162/log?cid=1&type=daily&date=2008-03-01&page=4&column=5&column=6"
    @counter.set_url(url)
    assert_equal(0, @counter.get_logs.length)
    assert(!@counter.log_exist?)
  end

  # The last page of a day may be only partially filled.
  def test_partially_filled_last_page
    url = "http://counter.hatena.ne.jp/syou6162/log?cid=1&type=daily&date=2008-03-02&page=4&column=5&column=6"
    @counter.set_url(url)
    logs = @counter.get_logs
    assert_equal(2, logs.length)
    assert_seven_columns(logs)
  end

  # An invalid date (March 32nd) must surface as a RuntimeError.
  def test_set_url_raises_on_invalid_date
    url = "http://counter.hatena.ne.jp/syou6162/log?cid=1&type=daily&date=2008-03-32&page=4&column=5&column=6"
    assert_raises(RuntimeError, "caught 500 error!") {
      @counter.set_url(url)
    }
  end

  # Leap year, common year and a single month.
  def test_make_date_sequence_length
    assert_equal(366,make_date_sequence({"year"=>2008,"month"=>1,"day"=>1},{"year"=>2008,"month"=>12,"day"=>31}).length)
    assert_equal(365,make_date_sequence({"year"=>2007,"month"=>1,"day"=>1},{"year"=>2007,"month"=>12,"day"=>31}).length)
    assert_equal(31,make_date_sequence({"year"=>2008,"month"=>1,"day"=>1},{"year"=>2008,"month"=>1,"day"=>31}).length)
  end

  def test_make_date_sequence_starts_on_first_day
    assert_equal(1,make_date_sequence({"year"=>2008,"month"=>1,"day"=>1},{"year"=>2008,"month"=>1,"day"=>31})[0].day)
  end

  # Date sequences spanning a year boundary produce fetchable URLs.
  def test_logs_across_year_boundary
    assert_full_first_page_for_each_day({"year"=>2007,"month"=>12,"day"=>31},{"year"=>2008,"month"=>1,"day"=>2})
  end

  # Date sequences spanning February 29th produce fetchable URLs.
  def test_logs_across_leap_day
    assert_full_first_page_for_each_day({"year"=>2008,"month"=>2,"day"=>28},{"year"=>2008,"month"=>3,"day"=>1})
  end

  private

  # Asserts that every row in +logs+ has exactly 7 cells.
  def assert_seven_columns(logs)
    logs.each{|item|
      assert_equal(7,item.length)
    }
  end

  # Fetches the first log page of every day from +start+ to +finish+ and
  # asserts that it holds 50 rows of 7 cells each.
  def assert_full_first_page_for_each_day(start, finish)
    make_date_sequence(start, finish).each{|day|
      url = "http://counter.hatena.ne.jp/syou6162/log?cid=1&type=daily&date="+day.to_s
      @counter.set_url(url)
      logs = @counter.get_logs
      assert_equal(50,logs.length)
      assert_seven_columns(logs)
    }
  end
end