はてなカウンターからアクセスログをちまちま収集して、DBにぶちこんでいくスクリプトを書いた

本当は問題発見と解決くらいの時に、これができてればよかったんだけどねwww。というわけで初めてsqliteとか使ってみました。DBの設計とかよく分からないというか、忘れてしまったので適当。マルチスレッドでやろうとしたんだけど、Singletonパターンでできているオブジェクトの共有とか、DBの書き込みの競合とかがおこって、わけわかめになったため、まったりと待つバージョンにしました。

main.rb

実行するのはこれ。

require 'hatena_counter'

# Walks every day in the date range and, for each day, every log page until an
# empty page is reached, storing each non-empty page through Counters#register.
counters = Counters.new("counter.db")

make_date_sequence({"year"=>2007,"month"=>1,"day"=>1},{"year"=>2008,"month"=>5,"day"=>30}).each{|day|
  page = 1
  loop do
    # Build the URL for the *current* page before fetching it. Previously the
    # URL was rebuilt only after the fetch, so the existence check always
    # looked at the previous page (page 1 was fetched twice and one extra,
    # empty page was registered for every day).
    url = "http://counter.hatena.ne.jp/syou6162/log?cid=1&type=daily&date="+day.to_s+"&page="+page.to_s
    counters.get_counter.set_url(url)
    counters.get_counter.get_logs
    break unless counters.get_counter.log_exist?

    counters.register(day.to_s,page)
    puts url
    page = page + 1
    sleep 3 # pause between pages so the server is not hammered
  end
}

hatena_counter.rb

「そこgetじゃなくてsetだよ」ってのとかがあったけど、とりあえず放置。死んだほうがいい。

# -*- coding: utf-8 -*-

require 'rubygems'
require 'hpricot'
require 'mechanize'
require 'sqlite3'
require 'yaml'
require 'singleton'
require 'kconv'

# Builds the list of consecutive calendar dates from +start+ to +finish+,
# both ends included.
#
# start, finish:: hashes with 'year', 'month' and 'day' entries.
# Returns an Array of Date objects (empty when finish is earlier than start).
def make_date_sequence(start,finish)
  to_date = lambda { |parts| Date.new(parts['year'], parts['month'], parts['day']) }
  last_day = to_date.call(finish)
  first_day = to_date.call(start)
  (first_day..last_day).to_a
end

# Scraper for Hatena Counter access-log pages. The class is a Singleton, so
# every caller shares one logged-in session via Counter.instance. Creating the
# instance logs in to Hatena with the name/password read from "hatena.yaml".
class Counter
  include Singleton

  # Mechanize agent holding the session created in initialize.
  attr_accessor :agent
  # Hpricot document parsed from the page fetched by the last set_url call.
  attr_accessor :source
  # Data rows (arrays of cell text) produced by the last get_logs call.
  attr_accessor :outer

  # Logs in by filling and submitting the first form on the login page.
  def initialize
    @agent = WWW::Mechanize.new
    @agent.max_history = 1
    config = YAML.load_file("hatena.yaml")
    url = "https://www.hatena.ne.jp/login"

    page = @agent.get(url)
    login_form = page.forms[0]
    login_form['name'] = config['name']
    login_form['password'] = config['password']

    @agent.submit(login_form)
  end

  # Fetches +url+ and stores the parsed document in @source (despite the
  # name, this performs the HTTP request).
  #
  # Timeouts and 502 responses are retried; a 404, any other response code, or
  # any other error is re-raised as a RuntimeError carrying only a message.
  def set_url(url)
    begin
      @source = Hpricot(@agent.get_file(url))

    rescue Timeout::Error
      puts "caught Timeout::Error!"
      # Never give up on a timeout.
      # NOTE(review): this retry is unbounded and has no back-off, so a
      # permanently unreachable host loops forever.
      retry
    rescue WWW::Mechanize::ResponseCodeError => e
      case e.response_code
      when "404"
        raise "Net::HTTPNotFound!"
      when "502"
        puts "Net::HTTPBadGateway!"
        retry # try once more when the gateway misbehaves (also unbounded)
      else
        raise "caught Excepcion!" + e.response_code
      end
    rescue => ex
      # Flattens every other error into a RuntimeError with the same message.
      raise ex.message
    end
  end

  # Extracts the log table of the current page.
  #
  # Returns (and stores in @outer) one array of cell texts per table row,
  # with the first row skipped. Always returns an Array: empty when the table
  # has only its first row or when no row matches at all.
  def get_logs
    rows = (@source/"table#log_table>tr").map { |row|
      (row/:td).map { |cell| cell.inner_text }
    }
    # Slicing an empty array from index 1 yields nil, which used to make
    # log_exist? blow up; fall back to an empty list instead.
    @outer = rows[1..-1] || []
    return @outer
  end

  # True when the last get_logs call found at least one data row; false when
  # it found none or when get_logs has not been called yet.
  def log_exist?
    # Literal booleans: the TRUE/FALSE constants are no longer defined in
    # current Ruby versions.
    !(@outer.nil? || @outer.empty?)
  end
end

# Stores scraped Hatena Counter log rows in the SQLite file named by +db+.
class Counters
  # Schema of the single table. `date` is the primary key with
  # ON CONFLICT IGNORE, so a row whose date value is already stored is
  # silently skipped on insert.
  CREATE_TABLE_SQL = <<-SQL
        create table if not exists counter (
        date text primary key on conflict ignore,
        url  text,
        browser text,
        language text,
        screen text,
        access text,
        link text
    );
  SQL

  # One placeholder per column; values are bound, never spliced into the SQL.
  INSERT_SQL = "insert into counter (date,url,browser,language,screen,access,link) values (?,?,?,?,?,?,?)".freeze

  # Number of cells taken from each scraped row, in column order.
  COLUMN_COUNT = 7

  # Path of the SQLite database file.
  attr_accessor :db

  # Remembers the database path and creates the `counter` table if the
  # database does not have it yet.
  def initialize(db="counter.db")
    @db = db
    with_database { |handle| handle.execute(CREATE_TABLE_SQL) }
  end

  # Fetches log page +page+ of +day+ (a "YYYY-MM-DD" string) through the
  # shared Counter and inserts every row of that page in one transaction.
  #
  # A row that raises SQLite3::SQLException is reported on stdout and
  # skipped; the remaining rows are still inserted.
  def register(day,page)
    url = "http://counter.hatena.ne.jp/syou6162/log?cid=1&type=daily&date=#{day}&page=#{page}"
    counter = get_counter
    counter.set_url(url)
    logs = counter.get_logs
    with_database { |handle|
      handle.busy_timeout(5000)
      handle.transaction { |t|
        logs.each { |log|
          # Missing cells become "" (as string interpolation did before);
          # every value is converted to UTF-8 before it is bound.
          values = (0...COLUMN_COUNT).map { |i| log[i].to_s.toutf8 }
          begin
            # Bound parameters: the cells hold visitor-supplied text (URLs,
            # referrers, browser names) that may contain quotes.
            t.execute(INSERT_SQL, values)
          rescue SQLite3::SQLException => ex
            puts ex.message
          end
        }
      }
    }
  end

  # Returns the shared Counter instance.
  def get_counter
    return Counter.instance
  end

  private

  # Opens the database, yields the handle, and always closes it afterwards.
  def with_database
    handle = SQLite3::Database.new(@db)
    begin
      yield handle
    ensure
      handle.close
    end
  end
end

test_hatena_counter.rb

テストとかほとんど書いたことがないゆとりなんですが、なんとなく書いてみた。実行するとこんな感じになるのかー。

/Users/yasuhisa% ruby test_hatena_counter.rb 
Loaded suite test_hatena_counter
Started
.........
Finished in 5.92334 seconds.

9 tests, 418 assertions, 0 failures, 0 errors

で、テストするためのスクリプト。書いた後に「test_1」とかではなくて、本体のメソッドに対応した名前のついたtest用のメソッドが必要であることに気がついたりとかした。

require 'test/unit'
require 'hatena_counter'

# Tests for Counter and make_date_sequence. The Counter tests fetch real
# pages from counter.hatena.ne.jp through the shared, logged-in instance.
class TestHatenaCounter < Test::Unit::TestCase
  def setup
    @counter = Counter.instance
  end

  # An explicit first page yields rows of seven cells each.
  def test_1
    url = "http://counter.hatena.ne.jp/syou6162/log?cid=1&type=daily&date=2008-03-01&page=1&column=5&column=6"
    @counter.set_url(url)
    assert_seven_columns(@counter.get_logs)
  end

  # Without a page parameter a page of 50 rows is returned.
  def test_2
    url = "http://counter.hatena.ne.jp/syou6162/log?cid=1&type=daily&date=2008-03-01"
    @counter.set_url(url)
    logs = @counter.get_logs
    assert_equal(50, logs.length)
    # Literal booleans: the TRUE/FALSE constants are no longer defined in
    # current Ruby versions.
    assert_equal(true, @counter.log_exist?)
    assert_seven_columns(logs)
  end

  # A page past the end of the log has no rows.
  def test_3
    url = "http://counter.hatena.ne.jp/syou6162/log?cid=1&type=daily&date=2008-03-01&page=4&column=5&column=6"
    @counter.set_url(url)
    assert_equal(0, @counter.get_logs.length)
    assert_equal(false, @counter.log_exist?)
  end

  # A partially filled last page returns only the rows it has.
  def test_4
    url = "http://counter.hatena.ne.jp/syou6162/log?cid=1&type=daily&date=2008-03-02&page=4&column=5&column=6"
    @counter.set_url(url)
    logs = @counter.get_logs
    assert_equal(2, logs.length)
    assert_seven_columns(logs)
  end

  # An impossible date (March 32nd) makes set_url raise a RuntimeError.
  def test_5
    url = "http://counter.hatena.ne.jp/syou6162/log?cid=1&type=daily&date=2008-03-32&page=4&column=5&column=6"
    assert_raises(RuntimeError, "caught 500 error!") { @counter.set_url(url) }
  end

  # Sequence lengths: leap year, plain year, single month.
  def test_6
    assert_equal(366,make_date_sequence({"year"=>2008,"month"=>1,"day"=>1},{"year"=>2008,"month"=>12,"day"=>31}).length)
    assert_equal(365,make_date_sequence({"year"=>2007,"month"=>1,"day"=>1},{"year"=>2007,"month"=>12,"day"=>31}).length)
    assert_equal(31,make_date_sequence({"year"=>2008,"month"=>1,"day"=>1},{"year"=>2008,"month"=>1,"day"=>31}).length)
  end

  # The sequence starts on the start date itself.
  def test_7
    assert_equal(1,make_date_sequence({"year"=>2008,"month"=>1,"day"=>1},{"year"=>2008,"month"=>1,"day"=>31})[0].day)
  end

  # Dates spanning a year boundary each yield a page of 50 rows.
  def test_8
    assert_full_first_pages({"year"=>2007,"month"=>12,"day"=>31},{"year"=>2008,"month"=>1,"day"=>2})
  end

  # Dates spanning February 29th of a leap year each yield a page of 50 rows.
  def test_9
    assert_full_first_pages({"year"=>2008,"month"=>2,"day"=>28},{"year"=>2008,"month"=>3,"day"=>1})
  end

  private

  # Asserts that every row in +logs+ has exactly seven cells.
  def assert_seven_columns(logs)
    logs.each{|item|
      assert_equal(7,item.length)
    }
  end

  # Asserts that, for every day from +start+ to +finish+, the default log
  # page holds 50 rows of seven cells each.
  def assert_full_first_pages(start,finish)
    make_date_sequence(start,finish).each{|day|
      url = "http://counter.hatena.ne.jp/syou6162/log?cid=1&type=daily&date="+day.to_s
      @counter.set_url(url)
      logs = @counter.get_logs
      assert_equal(50,logs.length)
      assert_seven_columns(logs)
    }
  end
end