Project: finder, src: main.cc
メインロジック。各種フラグはソースコードを参照のこと。
#include "Finder.h"
#include "docid.h"
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <fcntl.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <fstream>
#include <iostream>
#include <string>
/**
 * Simple wall-clock stopwatch built on gettimeofday().
 *
 * Timing starts when the object is constructed. stop() records the current
 * time and returns the elapsed seconds; watch() re-reports the interval
 * captured by the most recent stop().
 */
class Timer {
private:
    struct timeval startTime, stopTime;
public:
    /** Starts the stopwatch. */
    Timer() {
        gettimeofday(&startTime, 0);
        // Keep watch() well-defined (0 seconds) until stop() is first called;
        // otherwise it would read an uninitialized timeval.
        stopTime = startTime;
    }

    /**
     * Records the current time as the stop time.
     * @return seconds elapsed between construction and this call.
     */
    double stop() {
        gettimeofday(&stopTime, 0);
        return watch();
    }

    /**
     * @return seconds between construction and the last stop() call,
     *         or 0 if stop() has not been called yet.
     */
    double watch() const {
        const double wholeSeconds = stopTime.tv_sec - startTime.tv_sec;
        // May be negative when the microsecond field wrapped; the sum is still correct.
        const double microSeconds = stopTime.tv_usec - startTime.tv_usec;
        return wholeSeconds + microSeconds / 1000.0 / 1000.0;
    }
};
// Forward declarations of the worker functions defined after main().

// Reads the journal file line by line and stores each line (trailing
// whitespace removed) in the archive file arfile via createDocId().
void createArchive(char *journal, char *arfile);
// Feeds the document ids in [minid, maxid) to a Finder constructed over
// arfile and trfile; maxid is capped by getLastDocId(), and bTrunc makes
// trfile be truncated when it is opened.
void createTrMatrix(char *arfile, char *trfile, bool bTrunc, unsigned long long minid, unsigned long long maxid);
// Looks up searchString through a Finder and prints each matching document
// id with its title; bHTML prints the title as an HTML link instead of text.
void searchWord(char *arfile, char *trfile, char *searchString, bool bHTML);
// Removes trailing whitespace from str in place.
void trim(std::string &str);
/**
 * Parses a document-id range written as "<min>" or "<min>,<max>".
 *
 * @param arg   NUL-terminated range text (the -m option argument).
 * @param minid receives the number before the first comma, or the whole
 *              argument when there is no comma.
 * @param maxid receives the number after the first comma; left unchanged
 *              when arg contains no comma.
 *
 * Numbers are parsed as unsigned base-10 values with strtoull(), so the
 * full unsigned long long range is accepted; text that does not start with
 * a number yields 0.
 */
void getRange(char *arg, unsigned long long &minid, unsigned long long &maxid) {
    const std::string rangeStr(arg);
    // size_type (not unsigned int) so the npos comparison needs no truncating cast.
    const std::string::size_type comma = rangeStr.find(',');
    if (comma == std::string::npos) {
        minid = strtoull(rangeStr.c_str(), 0, 10);
        return;
    }
    const std::string minStr(rangeStr.substr(0, comma));
    const std::string maxStr(rangeStr.substr(comma + 1));
    minid = strtoull(minStr.c_str(), 0, 10);
    maxid = strtoull(maxStr.c_str(), 0, 10);
}
/**
 * Command-line entry point.
 *
 * Options:
 *   -j <journal>      build the document archive from <journal>
 *   -a <archive>      archive file (default "docs.dbarch")
 *   -t <trfile>       transpose-matrix file (default "docs.trfile")
 *   -c                build the transpose matrix
 *   -i                truncate the transpose-matrix file before building
 *   -m <min>[,<max>]  restrict -c to a document-id range
 *   -s <word>         search for <word>
 *   -h                print search hits as HTML links
 *   -d                accepted for compatibility; currently has no effect
 *
 * The archive, matrix and search steps run in that order, each timed.
 * Prints a usage line when no step was requested.
 *
 * @return always 0.
 */
int main(int argc, char **argv) {
    // The worker functions take non-const char *, so the defaults live in
    // writable static arrays (no heap copy that would need freeing).
    static char defaultArfile[] = "docs.dbarch";
    static char defaultTrfile[] = "docs.trfile";

    int opt;
    bool bTrunc = false;
    bool bCreateTrMatrix = false;
    bool bWorked = false;
    bool bHTML = false;
    char *journal = 0;
    char *arfile = defaultArfile;
    char *trfile = defaultTrfile;
    char *searchString = 0;
    unsigned long long minid = 0;
    unsigned long long maxid = (unsigned long long)-1;  // "no upper limit"

    while ((opt = getopt(argc, argv, "cdhj:ia:t:s:m:")) != -1) {
        switch (opt) {
        case 'c':
            bCreateTrMatrix = true;
            break;
        case 'd':
            // Recognised but unused: nothing in this file reads a delta flag.
            break;
        case 'i':
            bTrunc = true;
            break;
        case 'j':
            journal = optarg;
            break;
        case 'a':
            arfile = optarg;
            break;
        case 't':
            trfile = optarg;
            break;
        case 's':
            searchString = optarg;
            break;
        case 'm':
            getRange(optarg, minid, maxid);
            break;  // previously fell through and switched on HTML output
        case 'h':
            bHTML = true;
            break;
        }
    }

    if (journal) {
        std::cout << "creating document archives..." << std::endl;
        Timer watch;
        createArchive(journal, arfile);
        std::cout << "elapsed time(sec):" << watch.stop() << std::endl;
        bWorked = true;
    }
    if (bCreateTrMatrix) {
        std::cout << "creating transpose matrix..." << std::endl;
        Timer watch;
        createTrMatrix(arfile, trfile, bTrunc, minid, maxid);
        std::cout << "elapsed time(sec):" << watch.stop() << std::endl;
        bWorked = true;
    }
    if (searchString) {
        std::cout << "start to search specified words..." << std::endl;
        Timer watch;
        searchWord(arfile, trfile, searchString, bHTML);
        std::cout << "elapsed time(sec):" << watch.stop() << std::endl;
        bWorked = true;
    }
    if (!bWorked) {
        std::cout << "usage: finder -c -i -j <journal> -a <text archive> -t <trmatrix file> -m <minid>,<maxid> -s <search word>" << std::endl;
    }
    return 0;
}
/**
 * Builds the document archive from a journal file.
 *
 * Each line of the journal has its trailing whitespace removed and is then
 * stored with createDocId(); progress is printed whenever the returned id
 * is a multiple of 10000.
 *
 * @param journal path of the input text file, one document per line.
 * @param arfile  path of the archive file; created if missing and
 *                truncated before writing.
 *
 * If either file cannot be opened, an error is written to std::cerr and
 * nothing else happens. The journal is opened first so that an unreadable
 * journal does not wipe an existing archive.
 */
void createArchive(char *journal, char *arfile) {
    std::ifstream fis(journal);
    if (!fis) {
        std::cerr << "cannot open journal file: " << journal << std::endl;
        return;
    }
    const int fd = open(arfile, O_RDWR | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) {
        std::cerr << "cannot open archive file: " << arfile << std::endl;
        return;
    }
    std::string line;
    while (std::getline(fis, line)) {
        trim(line);
        const unsigned long long id = createDocId(fd, line.data(), line.length());
        if ((id % 10000ULL) == 0) {
            std::cout << "archived No." << id << " document" << std::endl;
        }
    }
    close(fd);
}
/**
 * Builds the transpose matrix by adding archived documents to a Finder.
 *
 * @param arfile path of the document archive (opened read-only).
 * @param trfile path of the matrix file (opened read/write, created if
 *               missing).
 * @param bTrunc when true, trfile is truncated on open.
 * @param minid  first document id to add.
 * @param maxid  exclusive upper bound; capped by getLastDocId(arfile).
 *
 * If either file cannot be opened, an error is written to std::cerr and the
 * function returns without touching the Finder.
 */
void createTrMatrix(char *arfile, char *trfile, bool bTrunc, unsigned long long minid, unsigned long long maxid) {
    const int afd = open(arfile, O_RDONLY);
    if (afd < 0) {
        std::cerr << "cannot open archive file: " << arfile << std::endl;
        return;
    }
    int flags = O_RDWR | O_CREAT;
    if (bTrunc) {
        flags |= O_TRUNC;
    }
    const int tfd = open(trfile, flags, 0644);
    if (tfd < 0) {
        std::cerr << "cannot open transpose matrix file: " << trfile << std::endl;
        close(afd);
        return;
    }
    const unsigned long long maxidReal = getLastDocId(afd);
    if (maxid > maxidReal) {
        maxid = maxidReal;
    }
    {
        // Inner scope: finder is destroyed before the descriptors are closed.
        Finder finder(afd, tfd);
        // NOTE(review): the loop stops before maxid; if getLastDocId() returns
        // the last valid id rather than one past it, that document is never
        // added. Confirm against docid.h.
        for (unsigned long long id = minid; id < maxid; ++id) {
            if ((id % 10000ULL) == 0) {
                std::cout << "adding " << id << std::endl;
            }
            finder.add(id);
        }
        finder.flush();
    }
    close(afd);
    close(tfd);
}
/**
 * Writes the low byte of x to std::cout as a percent-escape: '%' followed
 * by two lowercase hexadecimal digits.
 */
void showHex(int x) {
    static const char digits[] = "0123456789abcdef";
    const char highNibble = digits[(x >> 4) & 15];
    const char lowNibble = digits[x & 15];
    std::cout << "%" << highNibble << lowNibble;
}
/**
 * Tells whether cc is one of the punctuation characters that showHref()
 * copies into a URL unescaped: _ - = ~ * . , ! $
 *
 * @return true for those nine characters, false for anything else.
 */
bool isAllowedChar(int cc) {
    switch (cc) {
    case '_':
    case '-':
    case '=':
    case '~':
    case '*':
    case '.':
    case ',':
    case '!':
    case '$':
        return true;
    default:
        return false;
    }
}
/**
 * Prints an HTML anchor linking a title to ja.wikipedia.org.
 *
 * @param title NUL-terminated title text.
 * @param len   number of bytes of title to encode into the URL.
 *
 * In the href, alphanumerics and the characters accepted by isAllowedChar()
 * are copied, a space becomes '+', and every other byte is percent-escaped
 * with showHex(). The visible link text is the title with the HTML special
 * characters & < > " replaced by entities so a title cannot break the markup.
 */
void showHref(const char *title, int len) {
    std::cout << "<a target=\"finder\" href=\"https://ja.wikipedia.org/wiki/";
    // NOTE(review): '+' for a space is the form-encoding convention; confirm
    // the target site decodes it as a space inside a URL path.
    for (const char *t = title; len > 0; ++t, --len) {
        // Mask to 0..255 so bytes >= 0x80 are not passed to isalnum() as negatives.
        const int cc = *t & 255;
        if (cc == ' ') {
            std::cout << "+";
        } else if (isalnum(cc) || isAllowedChar(cc)) {
            std::cout << *t;
        } else {
            showHex(cc);
        }
    }
    std::cout << "\">";
    // Link text: the whole NUL-terminated title, HTML-escaped.
    for (const char *t = title; *t != '\0'; ++t) {
        switch (*t) {
        case '&':
            std::cout << "&amp;";
            break;
        case '<':
            std::cout << "&lt;";
            break;
        case '>':
            std::cout << "&gt;";
            break;
        case '"':
            std::cout << "&quot;";
            break;
        default:
            std::cout << *t;
            break;
        }
    }
    std::cout << "</a>";
}
void searchWord(char *arfile, char *trfile, char *searchString, bool bHTML) {
Finder finder(arfile, trfile, false);
std::list<Document> result = finder.find(searchString, strlen(searchString));
std::cout << result.size() << " documents matched:" << std::endl;
int afd = finder.getDocfd();
for (std::list<Document>::iterator i = result.begin(); i != result.end(); ++i) {
unsigned long long id = (*i).getDocid();
int len;
char *title = getDocFromId(afd, id, &len);
std::cout << id << "(" << (*i).getBookmarks().size() << "):";
if (title) {
if (bHTML) {
showHref(title, len);
} else {
std::cout << title;
}
free(title);
} else {
std::cout << "(no title)";
}
std::cout << std::endl;
}
}
/**
 * Removes trailing whitespace (as classified by isspace()) from s in place.
 * Leading and interior whitespace is kept.
 *
 * @param s string to trim; becomes empty if it holds only whitespace.
 */
void trim(std::string &s) {
    std::string::size_type len = s.length();
    // isspace() requires a value representable as unsigned char; a plain
    // char holding a byte >= 0x80 (e.g. UTF-8 text) may be negative.
    while (len > 0 && isspace(static_cast<unsigned char>(s[len - 1]))) {
        --len;
    }
    s.erase(len);
}