Project: finder, src: main.cc


メインロジック。各種フラグはソースコードを参照のこと。
#include "Finder.h"
#include "docid.h"
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <iostream>
#include <fstream>
#include <string.h>
#include <string>
#include <ctype.h>
#include <sys/time.h>

// Wall-clock stopwatch built on gettimeofday().
//
// Timing starts when the object is constructed. stop() records the end of
// the interval and returns its length in seconds; watch() reports the
// length of the interval that was recorded last.
class Timer {
 private:
  struct timeval startTime, stopTime;
 public:
  // Starts the stopwatch.
  Timer() {
    gettimeofday(&startTime, 0);
    // No interval has been recorded yet: make it empty so that watch()
    // returns 0 instead of reading an uninitialized stopTime.
    stopTime = startTime;
  }
  // Records "now" as the end of the interval and returns the elapsed
  // seconds since construction.
  double stop() {
    gettimeofday(&stopTime, 0);
    return watch();
  }
  // Returns the seconds between construction and the last stop() call
  // (0 if stop() has not been called yet).
  double watch() const {
    const double seconds = stopTime.tv_sec - startTime.tv_sec;
    // May be negative when the microsecond field wrapped; the sum with
    // the whole-second difference is still the correct interval.
    const double micros = stopTime.tv_usec - startTime.tv_usec;
    return seconds + micros / 1000.0 / 1000.0;
  }
};

// Forward declarations of the helpers defined after main().

// Reads `journal` line by line and stores each line as a document in `arfile`.
void createArchive(char *journal, char *arfile);
// Adds the documents with ids in [minid, maxid) from `arfile` to the
// transpose matrix file `trfile`; `bTrunc` truncates `trfile` first.
void createTrMatrix(char *arfile, char *trfile, bool bTrunc, unsigned long long minid, unsigned long long maxid);
// Searches for `searchString` and prints the matching documents;
// `bHTML` prints each title as an HTML link instead of plain text.
void searchWord(char *arfile, char *trfile, char *searchString, bool bHTML);
// Removes trailing whitespace from `str` in place.
void trim(std::string &str);

// Parses the argument of the -m option.
//
// Accepted forms:
//   "<minid>"          sets minid only; maxid is left untouched.
//   "<minid>,<maxid>"  sets both bounds.
//   "<minid>,"         sets minid only; an empty upper bound keeps the
//                      caller's maxid instead of collapsing it to 0.
//
// Numbers are converted with atoll(), so non-numeric text yields 0.
//
// arg   NUL-terminated option argument (must not be null).
// minid receives the lower bound.
// maxid receives the upper bound when one is given.
void getRange(char *arg, unsigned long long &minid, unsigned long long &maxid) {
  const std::string rangeStr(arg);
  // Keep find()'s own result type: narrowing it to unsigned int would
  // truncate both the position and the npos sentinel.
  const std::string::size_type pos = rangeStr.find(',');
  if (pos == std::string::npos) {
    minid = (unsigned long long)atoll(rangeStr.c_str());
    return;
  }
  const std::string minStr(rangeStr.substr(0, pos));
  const std::string maxStr(rangeStr.substr(pos + 1));
  minid = (unsigned long long)atoll(minStr.c_str());
  if (!maxStr.empty()) {
    maxid = (unsigned long long)atoll(maxStr.c_str());
  }
}

// Entry point: parses the command line and runs the requested stages in
// the fixed order archive creation -> transpose matrix creation -> search.
//
// Options:
//   -j <journal>       create the document archive from <journal>
//   -c                 create the transpose matrix
//   -i                 truncate the transpose matrix file before creating it
//   -a <file>          document archive file (default "docs.dbarch")
//   -t <file>          transpose matrix file (default "docs.trfile")
//   -m <min>[,<max>]   document id range used by -c
//   -s <words>         search for <words>
//   -h                 print search results as HTML links
//   -d                 accepted for compatibility; currently has no effect
//
// Prints a usage line when no stage was requested. Always returns 0.
int main(int argc, char **argv) {
  int opt;
  bool bTrunc = false;
  bool bCreateTrMatrix = false;
  bool bWorked = false;
  bool bHTML = false;
  char *journal = 0;
  // The helpers take non-const char*, so the default names live in
  // writable local arrays rather than in heap copies that were never freed.
  char defaultArfile[] = "docs.dbarch";
  char defaultTrfile[] = "docs.trfile";
  char *arfile = defaultArfile;
  char *trfile = defaultTrfile;
  char *searchString = 0;
  unsigned long long minid = 0;
  unsigned long long maxid = (unsigned long long)-1;

  while ((opt = getopt(argc, argv, "cdhj:ia:t:s:m:")) != -1) {
    switch(opt) {
    case 'c':
      bCreateTrMatrix = true;
      break;
    case 'd':
      // Nothing reads this flag; it is still accepted so that existing
      // command lines keep working.
      break;
    case 'i':
      bTrunc = true;
      break;
    case 'j':
      journal = optarg;
      break;
    case 'a':
      arfile = optarg;
      break;
    case 't':
      trfile = optarg;
      break;
    case 's':
      searchString = optarg;
      break;
    case 'm':
      getRange(optarg, minid, maxid);
      // Without this break, -m fell through and switched HTML output on.
      break;
    case 'h':
      bHTML = true;
      break;
    }
  }

  if (journal) {
    std::cout << "creating document archives..." << std::endl;
    Timer watch;
    createArchive(journal, arfile);
    std::cout << "elapsed time(sec):" << watch.stop() << std::endl;
    bWorked = true;
  }
  if (bCreateTrMatrix) {
    std::cout << "creating transpose matrix..." << std::endl;
    Timer watch;
    createTrMatrix(arfile, trfile, bTrunc, minid, maxid);
    std::cout << "elapsed time(sec):" << watch.stop() << std::endl;
    bWorked = true;
  }
  if (searchString) {
    std::cout << "start to search specified words..." << std::endl;
    Timer watch;
    searchWord(arfile, trfile, searchString, bHTML);
    std::cout << "elapsed time(sec):" << watch.stop() << std::endl;
    bWorked = true;
  }
  if (!bWorked) {
    std::cout << "usage: finder -c -i -j <journal> -a <text archive> -t <trmatrix file> -m <minid>,<maxid> -s <search word>" << std::endl;
  }
  return 0;
}

// Builds the document archive from a journal file.
//
// Every line of `journal` is stripped of trailing whitespace and appended
// to `arfile` as one document via createDocId(). Progress is reported on
// stdout each time the returned id is a multiple of 10000.
//
// journal path of the input text file, one document per line.
// arfile  path of the archive; created if missing and truncated otherwise.
//
// On failure to open either file a message is written to stderr and the
// function returns without doing any work.
void createArchive(char *journal, char *arfile) {
  // Open the input first: an unreadable journal must not wipe an
  // existing archive through O_TRUNC.
  std::ifstream fis(journal);
  if (!fis) {
    std::cerr << "cannot open journal file: " << journal << std::endl;
    return;
  }
  int fd = open(arfile, O_RDWR | O_CREAT | O_TRUNC, 0644);
  if (fd < 0) {
    std::cerr << "cannot open archive file: " << arfile << std::endl;
    return;
  }
  std::string line;
  while (std::getline(fis, line)) {
    trim(line);
    unsigned long long id = createDocId(fd, line.data(), line.length());
    if ((id % 10000ULL) == 0) {
      std::cout << "archived No." << id << " document" << std::endl;
    }
  }
  close(fd);
}

// Builds (or extends) the transpose matrix file from the document archive.
//
// arfile  path of the document archive, opened read-only.
// trfile  path of the transpose matrix file; created if missing.
// bTrunc  when true, trfile is truncated before documents are added.
// minid   first document id to add.
// maxid   end of the id range (exclusive); clamped to getLastDocId().
//
// Ids in [minid, maxid) are passed to Finder::add(); progress is printed
// for every id that is a multiple of 10000.
// NOTE(review): the loop bound is exclusive, so whether the document with
// the id returned by getLastDocId() is indexed depends on that function's
// contract -- confirm against docid.h.
//
// On failure to open either file a message is written to stderr and the
// function returns without touching the matrix.
void createTrMatrix(char *arfile, char *trfile, bool bTrunc, unsigned long long minid, unsigned long long maxid) {
  int afd = open(arfile, O_RDONLY);
  if (afd < 0) {
    std::cerr << "cannot open archive file: " << arfile << std::endl;
    return;
  }
  int flags = O_RDWR | O_CREAT;
  if (bTrunc) {
    flags |= O_TRUNC;
  }
  int tfd = open(trfile, flags, 0644);
  if (tfd < 0) {
    std::cerr << "cannot open transpose matrix file: " << trfile << std::endl;
    close(afd);
    return;
  }
  unsigned long long maxidReal = getLastDocId(afd);
  if (maxid > maxidReal) {
    maxid = maxidReal;
  }
  {
    // Scoped so that the Finder is destroyed before the descriptors it
    // was given are closed.
    Finder finder(afd, tfd);
    for (unsigned long long id = minid; id < maxid; ++id) {
      if ((id % 10000ULL) == 0) {
        std::cout << "adding " << id << std::endl;
      }
      finder.add(id);
    }
    finder.flush();
  }
  close(afd);
  close(tfd);
}

// Writes the low byte of `x` to stdout as a percent-escape: '%' followed
// by two lowercase hexadecimal digits.
void showHex(int x) {
  static const char digits[] = "0123456789abcdef";
  const char high = digits[(x >> 4) & 15];
  const char low = digits[x & 15];
  std::cout << "%" << high << low;
}

// Returns true when `cc` is one of the punctuation characters that
// showHref() writes into a URL without percent-encoding.
bool isAllowedChar(int cc) {
  switch (cc) {
  case '_':
  case '-':
  case '=':
  case '~':
  case '*':
  case '.':
  case ',':
  case '!':
  case '$':
    return true;
  default:
    return false;
  }
}

// Writes `len` bytes of `text` to stdout as HTML character data, replacing
// the characters that have a meaning in markup by entity references.
static void showEscapedText(const char *text, int len) {
  for (const char *p = text; len > 0; ++p, --len) {
    switch (*p) {
    case '&':
      std::cout << "&amp;";
      break;
    case '<':
      std::cout << "&lt;";
      break;
    case '>':
      std::cout << "&gt;";
      break;
    case '"':
      std::cout << "&quot;";
      break;
    default:
      std::cout << *p;
      break;
    }
  }
}

// Writes an HTML anchor for a document title to stdout. The link points at
// https://ja.wikipedia.org/wiki/<title> and the title is the link text.
//
// title bytes of the title.
// len   number of bytes of `title` to use, for both the URL and the text.
//
// In the URL, ASCII letters, digits and the characters accepted by
// isAllowedChar() are copied as-is; every other byte is percent-encoded.
// A space becomes "%20": the title sits in the path part of the URL,
// where '+' is a literal plus sign rather than an encoded space.
// The link text is HTML-escaped so that titles containing '&', '<', '>'
// or '"' cannot break the surrounding markup.
void showHref(const char *title, int len) {
  std::cout << "<a target=\"finder\" href=\"https://ja.wikipedia.org/wiki/";
  const char *t = title;
  for (int rest = len; rest > 0; ++t, --rest) {
    // Mask to 0..255 so that isalnum() never sees a negative value.
    int cc = *t & 255;
    if (isalnum(cc) || isAllowedChar(cc)) {
      std::cout << *t;
    } else {
      showHex(cc);
    }
  }
  std::cout << "\">";
  showEscapedText(title, len);
  std::cout << "</a>";
}

void searchWord(char *arfile, char *trfile, char *searchString, bool bHTML) {
  Finder finder(arfile, trfile, false);
  std::list<Document> result = finder.find(searchString, strlen(searchString));
  std::cout << result.size() << " documents matched:" << std::endl;
  int afd = finder.getDocfd();
  for (std::list<Document>::iterator i = result.begin(); i != result.end(); ++i) {
    unsigned long long id = (*i).getDocid();
    int len;
    char *title = getDocFromId(afd, id, &len);
    std::cout << id << "(" << (*i).getBookmarks().size() << "):";
    if (title) {
      if (bHTML) {
        showHref(title, len);
      } else {
        std::cout << title;
      }
      free(title);
    } else {
      std::cout << "(no title)";
    }
    std::cout << std::endl;
  }
}

// Removes trailing whitespace (as classified by isspace()) from `s` in
// place. Leading and interior whitespace is kept; a string consisting
// only of whitespace becomes empty.
void trim(std::string &s) {
  std::string::size_type len = s.length();
  // isspace() is only defined for values representable as unsigned char
  // (or EOF); the bytes of multi-byte text are negative as plain char on
  // many platforms, so convert before classifying.
  while (len > 0 && isspace(static_cast<unsigned char>(s[len - 1]))) {
    --len;
  }
  s.erase(len);
}