#include <iostream> #include <fstream> #include <string> #include <vector> #include <map> #include <mecab.h> #include <boost/filesystem/operations.hpp> #include <boost/filesystem/path.hpp> #include <boost/filesystem/fstream.hpp> using namespace std; using namespace boost::filesystem; typedef map<string, int>::const_iterator map_freq_it; //下のみたいに毎回型を書いているのが面倒なときに別名を付けることができるのがtypedef typedef vector<map_freq_it>::const_iterator vec_stu_citer_t; vector<string> split( string s, string c ){ vector<string> ret; for( int i=0, n; i <= s.length(); i=n+1 ){ n = s.find_first_of( c, i ); if( n == string::npos ) n = s.length(); string tmp = s.substr( i, n-i ); ret.push_back(tmp); } return ret; } vector<string> dir_files(string filepath){ path fullPath = complete(path(filepath, native)); vector<string> files; directory_iterator end; for (directory_iterator it(fullPath); it !=end; ++it){ files.push_back(it->leaf()); } return files; } bool compare( const map_freq_it& a, const map_freq_it& b ){ // ファンクタというらしい return ( a->second > b->second ); } map<string, int> term_frequency(string file){ cout << file << endl; std::ifstream fin(file.c_str()); string str; char c; while (fin.get(c)) { str.push_back(c); } MeCab::Tagger *tagger = MeCab::createTagger( "" ); const MeCab::Node *node = tagger->parseToNode( str.c_str() ); map<string, int> freq; map<string, int>::iterator it; for( node=node->next; node->next; node=node->next ){ vector<string> strvec = split( node->feature, "," ); if (strvec[0] == "名詞"){ string noun = strvec[6]; it = freq.find(noun); if (it != freq.end()){ it->second += 1; }else{ freq.insert(pair<string, int>(noun, 1)); } } } delete tagger; return freq; } int main (int argc, char **argv) { string dir = "/Users/yasuhisa/dbcls/pne/year"; path fullPath = complete(path(dir, native)); directory_iterator end; for (directory_iterator it(fullPath); it !=end; ++it){ map<string, int> freq = term_frequency(dir + "/" + it->leaf()); std::ofstream ofs( (dir + "/result_" + 
it->leaf()).c_str() ); vector<map_freq_it> sorted; for(map_freq_it mfi = freq.begin(); mfi != freq.end(); ++mfi) sorted.push_back(mfi); sort(sorted.begin(), sorted.end(), compare); for(vec_stu_citer_t it = sorted.begin(); it != sorted.end(); ++it){ ofs << (*it)->first << "," << (*it)->second << endl; } } return 0; }
コンパイルに必要なオプションがやたらと多いので、そこで苦労したという話もある……。
/Users/yasuhisa/cpp% g++ -O2 `mecab-config --cflags` mecab.cpp -o mecab `mecab-config --libs` -l boost_system-xgcc40-mt -l boost_filesystem-xgcc40-mt-1_38 -l boost_filesystem-xgcc40-mt
『蛋白質核酸酵素』の本のテキストに対して、文字コード変換や必要な部分の抽出を行うためのスクリプト
上の作業をするための下準備。
# -*- coding: utf-8 -*-
# Preparation pipeline for the article text files:
#   1. copy the per-year .txt files into the working directory,
#   2. convert them with nkf (-S input / -w output),
#   3. reduce every file to title + abstract + body,
#   4. concatenate each year's files into raw/<year>.txt.
require "fileutils"

# Extracts the title, abstract and body from the tagged text +str+.
#
# Newlines are removed first so that the non-greedy patterns can match across
# what were line boundaries. Inside the body, every "<tag>...</tag>" span is
# dropped. The argument itself is not modified.
#
# Returns a Hash with the String keys "title", "abstract" and "body";
# a section that is not found is returned as "".
def extract(str)
  text = str.delete("\n")
  title    = text[/<日本語タイトル>(.*?)<\/日本語タイトル>/, 1] || ""
  abstract = text[/<要旨>(.*?)<\/要旨>/, 1] || ""
  body     = text[/<本文>(.*?)<\/本文>/, 1] || ""
  body     = body.gsub(/<.*?>.*?<\/.*?>/, "")
  return {"title"=>title, "abstract"=>abstract, "body"=>body}
end

# Runs +command+ through the shell and warns on stderr when it fails, so a
# broken step is visible without aborting the remaining years.
def run_shell(command)
  ok = system(command)
  warn "command failed: #{command}" unless ok
  ok
end

start_year = 1985
end_year = 2006
years = (start_year..end_year)

Dir.chdir("/Users/yasuhisa/Desktop/pne")
years.each{|year|
  # Copy the files over. The destination directory is created first so that
  # cp never copies onto a plain file named after the year.
  # NOTE(review): "**" is expanded by the shell, not by Ruby; without globstar
  # it presumably behaves like "*" (exactly one directory level) - confirm.
  FileUtils.mkdir_p(File.expand_path("~/dbcls/pne/#{year}"))
  run_shell "cp #{year}*/**/*.txt ~/dbcls/pne/#{year}"
}
puts "finished copying files..."

Dir.chdir("/Users/yasuhisa/dbcls/pne")
years.each{|year|
  # Convert from Shift_JIS to UTF-8 in place.
  run_shell "nkf -S -w --overwrite #{year}/*.txt"
}
puts "finished converting file encoding..."

years.each{|year|
  # Extract the needed parts.
  Dir.chdir("/Users/yasuhisa/dbcls/pne/#{year}")
  Dir.glob("*.txt").each{|f|
    # Normalises a doubled extension ("a.txt.txt" -> "a.txt"); for an ordinary
    # name this equals f, i.e. the file is rewritten in place.
    new_txt = File.basename(f,".txt").sub(".txt","") + ".txt"
    puts new_txt
    # Block form closes the handle even if reading or extracting raises.
    content = extract(File.open(f,"r"){|io| io.read })
    File.open(new_txt,"w"){|io|
      io.puts content["title"] + content["abstract"] + content["body"]
    }
  }
}
puts "finished extracting from txt..."

Dir.chdir("/Users/yasuhisa/dbcls/pne")
# The shell redirect below needs the output directory to exist.
FileUtils.mkdir_p("raw")
years.each{|year|
  # Concatenate the year's files into one.
  run_shell "cat #{year}/*.txt > raw/#{year}.txt"
}
puts "finished all!!"