読者です 読者をやめる 読者になる 読者になる

年ごとに頻度の高い単語をファイルに出力する

C++
#include <algorithm>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>

#include <mecab.h>

#include <boost/filesystem/fstream.hpp>
#include <boost/filesystem/operations.hpp>
#include <boost/filesystem/path.hpp>

using namespace std;
using namespace boost::filesystem;

// Alias for a read-only iterator into the word -> count map.
typedef map<string, int>::const_iterator map_freq_it; // typedef gives a short alias so that long types like the one below need not be written out every time
// Alias for a read-only iterator over a vector of those map iterators.
typedef vector<map_freq_it>::const_iterator vec_stu_citer_t;

// Splits `s` at every character that appears in `c`.
//
// `c` is treated as a set of single-character delimiters, not as one
// multi-character separator. Empty fields are kept, so the result always has
// (number of delimiters found + 1) elements; an empty `s` yields one empty
// string.
std::vector<std::string> split( const std::string& s, const std::string& c ){
  std::vector<std::string> ret;
  // Positions are kept as size_type: find_first_of() returns an unsigned
  // value (npos on failure) that does not fit reliably in an int.
  const std::string::size_type len = s.length();
  for( std::string::size_type i = 0, n; i <= len; i = n + 1 ){
    n = s.find_first_of( c, i );
    if( n == std::string::npos ) n = len;  // last field runs to the end of s
    ret.push_back( s.substr( i, n - i ) );
  }
  return ret;
}

vector<string> dir_files(string filepath){
  path fullPath = complete(path(filepath, native));
  vector<string> files;
  directory_iterator end;
  for (directory_iterator it(fullPath); it !=end; ++it){
	files.push_back(it->leaf());
  }
  return files;
}

// Ordering predicate for sorting map entries by count, largest first.
// This is a plain function passed to the sort algorithm as the comparison.
bool compare( const map_freq_it& a, const map_freq_it& b ){
  const int lhs_count = a->second;
  const int rhs_count = b->second;
  return rhs_count < lhs_count;
}

map<string, int> term_frequency(string file){
  cout << file << endl;

  std::ifstream fin(file.c_str());
  string str;
  char c;

  while (fin.get(c)) {
	str.push_back(c);
  }

  MeCab::Tagger *tagger = MeCab::createTagger( "" );
  const MeCab::Node *node = tagger->parseToNode( str.c_str() );
  map<string, int> freq;
  map<string, int>::iterator it;

  for( node=node->next; node->next; node=node->next ){ 
	vector<string> strvec = split( node->feature, "," );
	if (strvec[0] == "名詞"){
	  string noun = strvec[6];
	  it = freq.find(noun);
	  if (it != freq.end()){
		it->second += 1;
	  }else{

		freq.insert(pair<string, int>(noun, 1));
	  }
	}
  } 
  delete tagger;
  return freq;
}

int main (int argc, char **argv) 
{
  string dir = "/Users/yasuhisa/dbcls/pne/year";
  path fullPath = complete(path(dir, native));
  directory_iterator end;
  for (directory_iterator it(fullPath); it !=end; ++it){
	map<string, int> freq = term_frequency(dir + "/" + it->leaf());
	std::ofstream ofs( (dir + "/result_" + it->leaf()).c_str() );
	vector<map_freq_it> sorted;

	for(map_freq_it mfi = freq.begin(); mfi != freq.end(); ++mfi)
	  sorted.push_back(mfi);
	
	sort(sorted.begin(), sorted.end(), compare);
	
	for(vec_stu_citer_t it = sorted.begin(); it != sorted.end(); ++it){
	  ofs << (*it)->first << ","
		  << (*it)->second << endl;
	}
  }
  return 0;
}

コンパイルに必要なオプションがやたら多いので、そこで苦労したという話もある。。。

/Users/yasuhisa/cpp% g++ -O2 `mecab-config --cflags` mecab.cpp -o mecab `mecab-config --libs` -l boost_system-xgcc40-mt -l boost_filesystem-xgcc40-mt-1_38 -l boost_filesystem-xgcc40-mt

蛋白質核酸酵素の本から文字コード変換とか抽出をやるためのスクリプト

上の作業をするための下準備。

# -*- coding: utf-8 -*-

# Pulls the Japanese title, the abstract and the body text out of one tagged
# article.
#
# str - String holding the article; it is not modified.
#
# Newlines are removed before matching so that each element can be matched
# even when it spans several lines. Inside the body, every
# "<tag>...</tag>" run is removed together with its content.
#
# Returns a Hash with the String keys "title", "abstract" and "body"; a part
# that is not present in str is returned as "".
def extract(str)
  # Work on a copy: an in-place gsub! would change the caller's string and
  # raise on a frozen one.
  flat = str.delete("\n")

  title    = flat[/<日本語タイトル>(.*?)<\/日本語タイトル>/, 1].to_s
  abstract = flat[/<要旨>(.*?)<\/要旨>/, 1].to_s
  body     = flat[/<本文>(.*?)<\/本文>/, 1].to_s
  body     = body.gsub(/<.*?>.*?<\/.*?>/, "")

  return {"title"=>title, "abstract"=>abstract, "body"=>body}
end

require 'fileutils'

# First and last year (inclusive) processed by every step of this script.
start_year = 1985
end_year = 2006

Dir.chdir("/Users/yasuhisa/Desktop/pne")

(start_year..end_year).each{|year|
  # Copy this year's .txt files into the per-year working directory.
  # cp requires the destination directory to exist when it is given several
  # source files, so create it first.
  FileUtils.mkdir_p(File.expand_path("~/dbcls/pne/#{year}"))
  unless system "cp #{year}*/**/*.txt ~/dbcls/pne/#{year}"
    # Keep going with the remaining years, but do not hide the failure.
    warn "copy failed for #{year}"
  end
}
puts "finished copying files..."

Dir.chdir("/Users/yasuhisa/dbcls/pne")
start_year.upto(end_year) do |year|
  # Convert every .txt file of the year from Shift_JIS to UTF-8, in place.
  system "nkf -S -w --overwrite #{year}/*.txt"
end
puts "finished converting file encoding..."

(start_year..end_year).each{|year|
  # Reduce every article of the year to its title, abstract and body text.
  Dir.chdir("/Users/yasuhisa/dbcls/pne/#{year}")
  
  Dir.glob("*.txt").each{|f|
    # For an ordinary "name.txt" this is the same name as f, so the file is
    # rewritten in place once its content has been read.
    out_name = File.basename(f,".txt").sub("\.txt","") + ".txt"
    puts out_name
    # Block form closes each handle even if extract or the write raises.
    content = File.open(f,"r"){|src| extract(src.read) }
    File.open(out_name,"w"){|dst|
      dst.puts content["title"] + content["abstract"] + content["body"]
    }
  }
}
puts "finished extracting from txt..."

Dir.chdir("/Users/yasuhisa/dbcls/pne")
start_year.upto(end_year) do |year|
  # Concatenate all of the year's .txt files into a single file under raw/.
  system "cat #{year}/*.txt > raw/#{year}.txt"
end
puts "finished all!!"