文脈の情報も取り入れつつ特徴ベクトルを構築する、の続き、の続き

こことかここの続き。typenameとかのところではまってコンパイルできない＞＜とかやってました。
前回のSVMに投げたところでは、複合名詞中に何の単語が登場したか、という超単純な情報しか見ていなかったんだけど、今度は2つ前後の単語や語頭語尾の情報なども取り入れるようにしたので、前回の結果より大分ましになったかなと思います。
学習するために必要な時間も一時間くらいで割と現実的。ただ、問題としては
今は(名詞|接頭詞|記号)などを連結してできる最長のもので周辺の情報を考えている
- その部分文字列が専門用語であるかの判定とか、そういうのはやっていない
- この辺はちょっと重要な気がするので、来週取り組みたい
入力の改行コードが変なところがあるので、その辺で変なものが混じったりしている
以下はC++のコード。これの出力をsvm_lightに投げています。
// SVMで専門用語かどうかの判別問題を解かせるためのプログラム
// 前後の単語の情報や、複合名詞の最初の単名詞、最後の単名詞も素性として組み込む

#include <fstream>
#include <iostream>
#include <sstream>
#include <numeric>
#include <sstream>
#include <string>
#include <set>
#include <map>
#include <vector>
#include <algorithm>
#include <boost/filesystem/operations.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/filesystem/path.hpp>
#include <boost/filesystem/fstream.hpp>
#include <mecab.h>

std::string itos(int i) { // intをstringに変換
  std::ostringstream s;
  s << i;
  return s.str();
}

// Runs MeCab over `str` and returns the surface string of every morpheme,
// in order.  The BOS/EOS sentinel nodes are skipped.
// Returns an empty vector when the tagger cannot be created or parsing fails.
std::vector<std::string> str2vec(std::string& str) {
  std::vector<std::string> result;
  MeCab::Tagger *tagger = MeCab::createTagger("");
  if (!tagger) {
    return result;
  }
  const MeCab::Node *node = tagger->parseToNode(str.c_str());
  if (node) {
    for (node = node->next; node && node->next; node = node->next) {
      // node->surface is not NUL-terminated; build the string from the
      // explicit length instead of copying into a heap buffer that was
      // never freed.
      result.push_back(std::string(node->surface, node->length));
    }
  }
  delete tagger;
  return result;
}

// Hands out integer ids for feature keys.
// getID(key) returns the id already bound to `key`; a key that has not been
// seen before is registered on the spot and receives the next id.
// Ids start at 1.
template <class T>
struct map_for_feature_vector {
  // key -> feature id
  std::map<T, int> str2id;
  // Reverse lookup: keys in registration order (id n sits at index n - 1).
  std::vector<T> id2str;

  int getID(const T& key) {
    // `typename` is required: const_iterator is a dependent type here.
    typename std::map<T, int>::const_iterator found = str2id.find(key);
    if (found == str2id.end()) {
      // Unknown key: bind it to the next free id.
      const int fresh = str2id.size() + 1;
      str2id[key] = fresh;
      id2str.push_back(key);
      return fresh;
    }
    return found->second;
  }

  // Number of keys registered so far.
  int size() {
    return str2id.size();
  }
};

// A token's surface string together with its part-of-speech label.
class Word {
public:
  std::string word;  // surface string
  std::string pos;   // part-of-speech label

  // Both fields start out empty.
  Word() : word(), pos() {}

  // Builds a Word from a surface string and its part of speech.
  Word(std::string word, std::string pos) : word(word), pos(pos) {}

  virtual ~Word() {}
};

// Orders Words by surface string only; the part-of-speech label does not
// take part in the comparison.
bool operator<(const Word& a, const Word& b) {
  return a.word.compare(b.word) < 0;
}

// Returns true when a's surface string sorts before b's.
bool compare(Word& a, Word& b) {
  return b.word > a.word;
}

class Context { // 中心としている単語の周辺の情報をまとめるクラス
public:
  std::vector<std::string> words; // 考える中心となる複合名詞
  std::string head; // 複合名詞の先頭語  
  std::string tail; // 複合名詞の末尾
  std::vector<Word> prefix; // 複合名詞の前にある単語群
  std::vector<Word> suffix; // 複合名詞の後にある単語群
  int t; // 正例、負例、学習したいデータの値  
  Context(){
  };
  void clear(){ // 全てのvectorの情報をclearする
	words.clear();
	head.erase();
	tail.erase();
	prefix.clear();
	suffix.clear();
  };
  std::ostream& print(std::ostream& os) { // 結果を確認するための出力用関数
	for (std::vector<Word>::iterator it = prefix.begin(); it != prefix.end(); ++it) {
	  os << "\t" << (*it).word << "(" << (*it).pos << ")";
	  os << std::endl;
	}
	for (std::vector<std::string>::iterator it = words.begin(); it != words.end(); ++it) {
	  os << *it;
	}
	os << "(" << head << ", " << tail << ") : " << t;
	os << std::endl;
	for (std::vector<Word>::iterator it = suffix.begin(); it != suffix.end(); ++it) {
	  os << "\t" << (*it).word << "(" << (*it).pos << ")";
	  os << std::endl;
	}
	os << std::endl;
	return os;
  }
  virtual ~Context(){ 
  };
  std::string get_words(){ // vector<string>をinjectする
	std::string s;
	s = accumulate(words.begin(), words.end(), s);
	return s;
  };
  Word get_n_prefix(int n) { // n+1個前の単語の情報を取ってくる
	int c = 0;
	std::vector<Word>::reverse_iterator prev_context_it = prefix.rbegin();
	while (prev_context_it != prefix.rend()) {
	  if (c == n) {
		return *prev_context_it;
	  } else if(c < n) {
		prev_context_it++;
		c++;
	  } else {
		break;
	  }
	}
	Word w;
	return w;
  };
  Word get_n_suffix (int n) { // n+1個後の単語の情報を取ってくる
	int c = 0;
	std::vector<Word>::iterator next_context_it = suffix.begin();
	while (next_context_it != suffix.end()) {
	  if (c == n) {
		return *next_context_it;
	  } else if(c < n) {
		next_context_it++;
		c++;
	  } else {
		break;
	  }
	}
	Word w;
	return w;
  };
};

// 複合名詞の集まりを扱うクラス
class CompoundNounList { 
public:
  // sequenceとして許容する品詞の集合
  std::set<std::string> accept_pos_vec; 
  // 単語をkeyとして、そのコンテクスト(文脈)の情報をvalueとするmultimap
  std::multimap<std::string, Context> list;

  // idを振るための構造体群
  map_for_feature_vector<Word> prev2context;
  map_for_feature_vector<Word> prev_context;
  map_for_feature_vector<std::string> heads;
  map_for_feature_vector<std::string> tails;
  map_for_feature_vector<std::string> words;
  map_for_feature_vector<Word> next_context;
  map_for_feature_vector<Word> next2context;

  map_for_feature_vector<std::string> pos_id; // 品詞を投げたら対応するidを返すための構造体

  CompoundNounList(){ // 連結する品詞の種類
	accept_pos_vec.insert("名詞");
	accept_pos_vec.insert("接頭詞");
	accept_pos_vec.insert("記号");
  };

  std::multimap<std::string, Context> add_text(std::string& text, int t) {
	// textを入力として、前後の文脈の情報などを登録していく
	// tは正例、負例、学習データなどのフラグ
	std::multimap<std::string, Context> added_compound_noun_list; // 今回のtextで追加される素性の情報
	MeCab::Tagger *tagger = MeCab::createTagger("-O wakati");
	const MeCab::Node *node = tagger->parseToNode(text.c_str());
	std::string prev_pos; // 前の単語の品詞
	Context prev_context;
	Context current_context;
	Context next_context;
	for( node=node->next; node->next; node=node->next ){ 
	  char *s = new char [node->length + 1]; // 形態素解析されて出てきた文字列(品詞の情報ではなく)
	  strncpy(s, node->surface, node->length);
	  s[node->length] = '\0';
	  std::vector<std::string> strvec;
	  boost::algorithm::split(strvec, node->feature, boost::algorithm::is_any_of(","));
	  // strvec[0]は形態素解析された結果
	  if (accept_pos_vec.find(strvec[0]) != accept_pos_vec.end()){ // 今の単語の品詞が名詞である場合
		current_context.words.push_back(s);
		if (accept_pos_vec.find(prev_pos) == accept_pos_vec.end()) { // 前の単語の品詞が名詞ではない場合
		  current_context.head = s;
		  if (!prev_context.words.empty()) {
			prev_context.t = t;
			added_compound_noun_list.insert(std::pair<std::string, Context>(prev_context.get_words(), prev_context));
			list.insert(std::pair<std::string, Context>(prev_context.get_words(), prev_context));
		  }
		}
	  } else { // 今の単語の品詞が名詞ではない場合
		// が、前の単語の品詞は名詞である場合
		if (accept_pos_vec.find(prev_pos) != accept_pos_vec.end()) { 
		  Word w(std::string(s), strvec[0]);
		  current_context.suffix.push_back(w);
		  next_context.prefix.push_back(w);
		  current_context.tail = current_context.words.at(current_context.words.size()-1);

		  // contextを一つづつずらしていく
		  prev_context.clear();
		  prev_context = current_context;
		  current_context.clear();
		  current_context = next_context;
		  next_context.clear();
		} else { // 前も今の単語も名詞でない場合
		  Word w(std::string(s), strvec[0]);
		  prev_context.suffix.push_back(w);
		  current_context.prefix.push_back(w);
		}
	  }
	  prev_pos = strvec[0];
	}
	if (!current_context.words.empty()) { // 最後の文字が名詞の場合
	  current_context.t = t;
	  added_compound_noun_list.insert(std::pair<std::string, Context>(current_context.get_words(), current_context));
	  list.insert(std::pair<std::string, Context>(current_context.get_words(), current_context));
	}
	delete tagger;
	register_for_map(added_compound_noun_list);
	return added_compound_noun_list;
  };
  
  std::ostream& print(std::ostream& os) { // multimapのそれぞれiteratorを回してそれぞれ出力するための関数
	for (std::multimap<std::string, Context>::iterator it = list.begin(); it != list.end(); ++it) {
	  (*it).second.print(os);
	  os << std::endl;
	}
	return os;
  }
  
  virtual ~CompoundNounList(){
  };

  void register_for_map(std::multimap<std::string, Context>& list){ // 前後の文脈などのidを振っていくための関数
	for (std::multimap<std::string, Context>::iterator list_it = list.begin(); list_it != list.end(); ++list_it) {

	  // 見ている複合名詞の前の単語の品詞を登録
	  Context c = (*list_it).second;
	  std::vector<Word> prefix = c.prefix;
	  for (std::vector<Word>::iterator word_it = prefix.begin(); word_it != prefix.end(); ++word_it) {
		pos_id.getID((*word_it).pos); 
	  }
	  // 見ている複合名詞の後の単語の品詞を登録
	  std::vector<Word> suffix = c.suffix;
	  for (std::vector<Word>::iterator word_it = suffix.begin(); word_it != suffix.end(); ++word_it) {
		pos_id.getID((*word_it).pos); 
	  }
	  
	  register_for_prev_context(c);
	  heads.getID(c.head);
	  tails.getID(c.tail);
	  words.getID(c.head);
	  register_for_next_context(c);
	}  
  };
private:
  void register_for_prev_context(Context& c){ // 前の文脈をidに登録していくための関数
	std::vector<Word>::reverse_iterator prev_context_it = c.prefix.rbegin();
	if (prev_context_it != c.prefix.rend()) {
	  prev_context.getID(*prev_context_it);
	  prev_context_it++;
	  if (prev_context_it != c.prefix.rend()) {
		prev2context.getID(*prev_context_it);
	  }
	}
  };

  void register_for_next_context(Context& c){ // 前の文脈をidに登録していくための関数
	std::vector<Word>::iterator next_context_it = c.prefix.begin();
	if (next_context_it != c.prefix.end()) {
	  next_context.getID(*next_context_it);
	  next_context_it++;
	  if (next_context_it != c.prefix.end()) {
		next2context.getID(*next_context_it);
	  }
	}
  };
};

// Writes every stored context as one line of sparse features:
//   <label> <index>:1 <index>:1 ...
// Feature groups are laid out back to back in this order: second word before,
// word before, head, tail, component words, word after, second word after.
// Each group's indices are offset by the total size of the groups before it.
std::ostream& print_data_info(CompoundNounList& cl, std::ostream& os) {
  typedef std::multimap<std::string, Context>::iterator ContextIt;
  typedef std::vector<std::string>::const_iterator WordIt;
  // Work on the stored entries directly instead of deep-copying the map.
  std::multimap<std::string, Context>& list = cl.list;

  // Pass 1: make sure every key that will be looked up is registered.
  // getID() adds unknown keys (for example the empty Word returned when a
  // context has no neighbour), which would otherwise grow the dictionaries
  // while rows are being written and shift the offsets between rows.
  for (ContextIt it = list.begin(); it != list.end(); ++it) {
    Context& c = it->second;
    cl.prev2context.getID(c.get_n_prefix(1));
    cl.prev_context.getID(c.get_n_prefix(0));
    cl.heads.getID(c.head);
    cl.tails.getID(c.tail);
    for (WordIt w = c.words.begin(); w != c.words.end(); ++w) {
      cl.words.getID(*w);
    }
    cl.next_context.getID(c.get_n_suffix(0));
    cl.next2context.getID(c.get_n_suffix(1));
  }

  // The dictionaries are final now, so the group offsets are fixed.
  const int prev2_base = 0;
  const int prev_base = prev2_base + cl.prev2context.size();
  const int head_base = prev_base + cl.prev_context.size();
  const int tail_base = head_base + cl.heads.size();
  const int word_base = tail_base + cl.tails.size();
  const int next_base = word_base + cl.words.size();
  const int next2_base = next_base + cl.next_context.size();

  // Pass 2: emit one row per context.
  for (ContextIt it = list.begin(); it != list.end(); ++it) {
    Context& c = it->second;
    os << c.t << " ";
    // get_n_prefix(1) is the second word back and belongs to prev2context;
    // get_n_prefix(0) is the nearest word and belongs to prev_context.
    os << cl.prev2context.getID(c.get_n_prefix(1)) + prev2_base << ":1 ";
    os << cl.prev_context.getID(c.get_n_prefix(0)) + prev_base << ":1 ";
    os << cl.heads.getID(c.head) + head_base << ":1 ";
    os << cl.tails.getID(c.tail) + tail_base << ":1 ";

    // Component words: each distinct word once, in ascending index order.
    std::vector<int> word_ids;
    for (WordIt w = c.words.begin(); w != c.words.end(); ++w) {
      word_ids.push_back(cl.words.getID(*w) + word_base);
    }
    std::sort(word_ids.begin(), word_ids.end());
    word_ids.erase(std::unique(word_ids.begin(), word_ids.end()), word_ids.end());
    for (std::vector<int>::const_iterator id = word_ids.begin(); id != word_ids.end(); ++id) {
      os << *id << ":1 ";
    }

    os << cl.next_context.getID(c.get_n_suffix(0)) + next_base << ":1 ";
    os << cl.next2context.getID(c.get_n_suffix(1)) + next2_base << ":1 " << '\n';
  }
  return os;
}

// Writes, one per line, the compound noun of every stored context whose
// label equals `t`.
std::ostream& print_data_name(CompoundNounList& cl, int t, std::ostream& os) {
  typedef std::multimap<std::string, Context>::iterator ContextIt;
  // Iterate over the stored entries in place; copying the whole multimap
  // just to read it is wasted work.
  for (ContextIt it = cl.list.begin(); it != cl.list.end(); ++it) {
    if (it->second.t == t) {
      os << it->second.get_words() << '\n';
    }
  }
  return os;
}


int main(int argc, char *argv[]) {
  using namespace std;
  using namespace boost;
  using namespace boost::filesystem;

  CompoundNounList cl;

  // 青空文庫を読み込ませる
  // 教師データの作成(負例)
  cout << "start reading 青空文庫..." << endl;
  std::string dir = "/Users/syou6162/dbcls/aozora/";
  path fullPath = complete(path(dir, native));
  directory_iterator end;
  int i = 0;
  for (directory_iterator it(fullPath); it !=end; ++it) {
	cout << (dir + it->leaf()) << endl;
	std::ifstream fis((dir + it->leaf()).c_str());
	string str;
	char c;
	while (fis.get(c)) {
	  str.push_back(c);
	}
	cl.add_text(str, -1); // -1は負例
	i++;
	if (i > 100) {
	  break;
	}
  }
  cout << "finish reading 青空文庫..." << endl;

  // 蛋白質核酸酵素を読み込ませる
  cout << "start reading pne..." << endl;
  for (int year = 1985; year < 2000; ++year) {
	string dir = "/Users/syou6162/dbcls/pne/" + itos(year);
	path fullPath = complete(path(dir, native));
	directory_iterator end;
	for (directory_iterator it(fullPath); it !=end; ++it){
	  string file = dir + "/" + it->leaf();
	  cout << file << endl;
	  std::ifstream fis(file.c_str());
	  string str;
	  char c;
	  while (fis.get(c)) {
		str.push_back(c);
	  }
	  multimap<string, Context> result = cl.add_text(str, 0); // 0は学習データ
	  cl.register_for_map(result);
	}
  }
  cout << "finish reading pne..." << endl;

  // 教師データの作成(正例)
  cout << "start reading umls..." << endl;
  std::ifstream fis("/Users/syou6162/dbcls/umls/umls2ja.tab");
  string str;
  while (getline(fis, str)) {
	vector<string> strvec;
	boost::algorithm::split(strvec, str, boost::algorithm::is_any_of("\t"));
	pair<multimap<string, Context>::iterator, multimap<string, Context>::iterator> p
	  = cl.list.equal_range(strvec[2]); // 青空文庫、pneですでに登録されている文脈の情報から辞書引き
	multimap<string, Context> tmp;
	for( ; p.first != p.second ; p.first++ ) {
	  Context c = (p.first)->second;
	  c.t = 1; // 1は正例のデータ
	  tmp.insert(std::pair<std::string, Context>((p.first)->first, c));
	}
	for (multimap<string, Context>::iterator it = tmp.begin(); it != tmp.end(); ++it) {
	  cl.list.insert(pair<string, Context>((*it).first, (*it).second));
	}
  }
  cout << "firstinish reading umls..." << endl;
  cl.print(std::cout);
  std::ofstream result("./result.txt");
  print_data_info(cl, result);
  std::ofstream name("./name.txt");
  print_data_name(cl, 0, name);
  return 0;
}