4. Indexers
4.1. index.text.cc
A program to generate indexing information from plain text. This can be used
to index anything which can be reasonably converted to plain text.
This has too much duplication with `index.man'.
#include "doc.hh"
#include <cstdio>
#include <cctype>
#include <cassert>
#include <cstring>
#include <popt.h>
#include <dirent.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <errno.h>
using namespace libdoc;
/* Format name handed to the DocumentSender when no `--format' option was
   given on the command line.  */
const string FORMAT_TEXT = "text";
The name of the program, taken from argv[0].
/* Set to argv[0] by `decode_options'; used as the prefix of the error
   messages printed to stderr.  */
const char *progname = 0;
Variables which are set by `decode_options' to describe the options
which were present. TODO: move option_stoplist to `docd'.
/* Flags (non-zero when the corresponding switch was given).  */
static int option_dryrun = 0;    /* --dry-run: write to stdout, not docd.  */
static int option_stdin = 0;     /* --stdin: read the text from stdin.  */
static int option_verbose = 0;   /* --verbose: report progress on stderr.  */

/* String-valued options; each stays null when the option is absent.  */
static const char *option_file = 0;      /* --file  */
static const char *option_format = 0;    /* --format  */
static const char *option_host = 0;      /* --host  */
static const char *option_stoplist = 0;  /* --stoplist  */

/* --port: port to connect to.  */
static int option_port = DEFAULT_PORT;

/* The document being sent; non-null only while `parse_textfile' runs.  */
static DocumentSender *current_document = 0;
The file to which indexing information is being written, which will usually
be the socket connected to `docd'.
/* Assigned by `decode_options': stdout for a dry run, otherwise a stream
   opened (with fdopen) on the connected socket.  It is handed to the
   DocumentSender created in `parse_textfile'.  */
FILE *dest_file;
Prototypes for functions defined below.
/* Parse the command line and set up `dest_file'; false means the options
   were bad.  */
bool decode_options (int argc, const char **argv);
/* Read `file' character by character and send each word found in it.  */
void parse_textfile (const string &path, FILE *file);
/* Character classification used by the word scanner.  */
bool is_word_start (unsigned char c);
bool is_word_middle (unsigned char c);
The table of command line options.
/* Command line options understood by the program, in the layout popt
   expects: long name, short name, argument kind, where to store the
   value, return value, help text, and the placeholder shown for the
   argument in the help output.  The all-zero entry terminates the table.

   Every entry stores straight into one of the `option_*' variables, so
   none of them needs a return value of its own (hence the 0).  */
poptOption options_table[] =
{
  POPT_AUTOHELP
  { "dry-run", 'd', POPT_ARG_NONE, &option_dryrun, 0,
    "No information is sent to the daemon, it is printed to stdout instead",
    0 },
  { "file", 'f', POPT_ARG_STRING, &option_file, 0,
    "Specify a single file to index", "FILENAME" },
  { "format", 'F', POPT_ARG_STRING, &option_format, 0,
    "Specify the name of the file format (default `text')", "FORMAT" },
  /* The value is parsed with inet_addr, so it has to be a numeric
     address; the placeholder says so instead of promising a hostname.  */
  { "host", 'h', POPT_ARG_STRING, &option_host, 0,
    "Specify the IP address of the host to connect to", "ADDRESS" },
  { "port", 'p', POPT_ARG_INT, &option_port, 0,
    "Specify the number of the port to connect to", "PORT" },
  { "stoplist", 's', POPT_ARG_STRING, &option_stoplist, 0,
    "Specify a file from which to read a list of stopwords", "FILENAME" },
  { "stdin", 'S', POPT_ARG_NONE, &option_stdin, 0,
    "Read from stdin but pretend it's really the file given with -f", 0 },
  { "verbose", 'V', POPT_ARG_NONE, &option_verbose, 0,
    "Print more information about what the program is doing", 0 },
  { NULL, 0, 0, NULL, 0, NULL, NULL }
};
/* Program entry point.

   Decodes the command line, opens the input (the file named with `-f',
   or stdin when `--stdin' was given) and passes it to `parse_textfile'.

   Exit status:
     0  success;
     1  bad options, missing or relative `-f' path, or an Exception was
        caught;
     2  the input file could not be opened.  */
int
main (int argc, const char **argv)
{
  try
    {
      if (!decode_options (argc, argv))
        return 1;

      if (!option_file)
        {
          fprintf (stderr, "%s: a file must be specified with `-f'.\n",
                   progname);
          return 1;
        }

      /* The path is passed on as the document's name, so a relative path
         is refused.  */
      if (option_file[0] != '/')
        {
          fprintf (stderr, "%s: a full path must be used for the file.\n",
                   progname);
          return 1;
        }

      /* With `--stdin' the text comes from stdin but is still reported
         under the name given with `-f'.  */
      FILE *f = stdin;
      if (!option_stdin)
        {
          f = fopen (option_file, "r");
          if (!f)
            {
              fprintf (stderr, "%s: error opening input file `%s'.\n",
                       progname, option_file);
              return 2;
            }
        }

      parse_textfile (option_file, f);

      /* Only close what we opened ourselves.  */
      if (f != stdin)
        fclose (f);
    }
  /* Catch by reference: catching by value would slice a derived
     exception (such as ExceptionSystemError) down to the base class
     before `explain' is called.  */
  catch (Exception &e)
    {
      fputs (e.explain().c_str(), stderr);
      return 1;
    }

  return 0;
}
Set progname and decode the command line options, using the popt
library.
Returns `false' if there is an error with the options.
bool
decode_options (int argc, const char **argv)
{
progname = argv[0];
poptContext context = poptGetContext ("index.text", argc, argv,
options_table, 0);
int rc = poptGetNextOpt (context);
if (rc < -1)
{
fprintf (stderr, "%s: error in command line options.\n", progname);
return false;
}
If the host IP address wasn't specified, set a default.
if (option_host == 0)
option_host = strdup (inet_ntoa (find_hostip()));
if (option_dryrun)
dest_file = stdout;
else
{
int sock = socket (PF_INET, SOCK_STREAM, 0);
if (sock == -1)
throw ExceptionSystemError ("can't open docd's socket");
sockaddr_in addr;
addr.sin_family = AF_INET;
addr.sin_port = htons (option_port);
addr.sin_addr.s_addr = inet_addr (option_host);
int rc = connect (sock, (sockaddr *) &addr, sizeof (addr));
if (rc == -1)
throw ExceptionSystemError ("can't connect to docd's socket");
dest_file = fdopen (sock, "w");
if (dest_file == NULL)
throw ExceptionSystemError ("can't fdopen the socket to docd");
}
return true;
}
This was derived from the manpage indexer, which is why it uses the full
state machine gubbins with only two states.
void
parse_textfile (const string &path, FILE *file)
{
if (option_verbose)
fprintf (stderr, "Parsing text file `%s'.\n", path.c_str());
enum State
{
S_START, S_WORD
};
State s = S_START;
string word;
current_document = new DocumentSender (path, dest_file,
option_format ? option_format : FORMAT_TEXT);
int c;
while ((c = fgetc (file)) ≠ EOF)
{
switch (s)
{
We are in this state when we start and when we are not in the other
state.
case S_START:
if (is_word_start (c))
word = c, s = S_WORD;
break;
While in this state a word is being read into word, until
something which can't be part of it is encountered.
case S_WORD:
if (is_word_middle (c))
word += c;
else
{
current_document->add_word (word);
s = S_START;
}
break;
This should never be executed.
default:
assert (false);
break;
}
}
delete current_document;
current_document = 0;
}
Return `true' if a character can be the first character of a word.
/* Tell whether `c' may begin a word: a letter (as classified by isalpha)
   or a hyphen.  */
bool
is_word_start (unsigned char c)
{
  if (c == '-')
    return true;

  return isalpha (c) != 0;
}
Return `true' if a character can be part of a word.
/* Tell whether `c' may continue a word that has already started: a
   letter or digit (as classified by isalnum), or one of the punctuation
   characters `-', `/', `.', `_' and `@'.  */
bool
is_word_middle (unsigned char c)
{
  switch (c)
    {
    case '-':
    case '/':
    case '.':
    case '_':
    case '@':
      return true;

    default:
      return isalnum (c) != 0;
    }
}