4. Indexers

4.1. index.text.cc


4.1.1

A program to generate indexing information from plain text. This can be used to index anything which can be reasonably converted to plain text.

This has too much duplication with `index.man'.


4.1.2
#include "doc.hh"
#include <cstdio>
#include <cctype>
#include <cassert>
#include <popt.h>
#include <dirent.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <errno.h>
#include <unistd.h>
using namespace libdoc;

4.1.3
// Format name handed to DocumentSender when no `--format' option was given.
const string FORMAT_TEXT = "text";

4.1.4

The name of the program, taken from argv[0].

// Prefix for diagnostics written to stderr; assigned from argv[0] by
// decode_options.
const char *progname = 0;

4.1.5

Variables which are set by `decode_options' to describe the options which were present. TODO: move option_stoplist to `docd'.

// Flags (set to non-zero by popt when the option is present).
static int option_dryrun = 0;    // -d: write to stdout instead of docd
static int option_stdin = 0;     // -S: read stdin, but name it as the -f file
static int option_verbose = 0;   // -V: report progress on stderr

// String arguments; NULL until the corresponding option is seen.
static const char *option_file = NULL;      // -f: path of the file to index
static const char *option_format = NULL;    // -F: format name for the document
static const char *option_host = NULL;      // -h: IP address of docd's host
static const char *option_stoplist = NULL;  // -s: stopword file (not read in
                                            //     this part of the program)

// -p: TCP port of docd.
static int option_port = DEFAULT_PORT;

4.1.6
// The document currently being indexed.  parse_textfile creates it before
// reading the input and deletes it (resetting this to 0) when it is done.
static DocumentSender *current_document = 0;

4.1.7

The file to which indexing information is being written, which will usually be the socket connected to `docd'.

// Set by decode_options: stdout for a dry run, otherwise a stream opened
// with fdopen on the socket connected to docd.
FILE *dest_file;

4.1.8

Prototypes for functions defined below.

// Parse the command line and open `dest_file'; false on an option error.
bool decode_options (int argc, const char **argv);
// Read `file' and send each word found in it as part of document `path'.
void parse_textfile (const string &path, FILE *file);
// True if `c' may begin a word.
bool is_word_start (unsigned char c);
// True if `c' may continue a word that has already begun.
bool is_word_middle (unsigned char c);

4.1.9

The table of command line options.

// Option descriptions for popt.  Every entry stores its result directly in
// one of the `option_*' variables and has a `val' of 0.  The all-zero entry
// terminates the table.
poptOption options_table[] =
{
   POPT_AUTOHELP

   // Flags.
   { "dry-run", 'd', POPT_ARG_NONE, &option_dryrun, 0,
     "No information is sent to the daemon, it is printed to stdout instead",
     NULL },
   { "stdin", 'S', POPT_ARG_NONE, &option_stdin, 0,
     "Read from stdin but pretend its really the file given with -f",
     NULL },
   { "verbose", 'V', POPT_ARG_NONE, &option_verbose, 0,
     "Print more information about what the program is doing",
     NULL },

   // Options describing the input.
   { "file", 'f', POPT_ARG_STRING, &option_file, 0,
     "Specify a single file to index",
     "FILENAME" },
   { "format", 'F', POPT_ARG_STRING, &option_format, 0,
     "Specify the name of the file format (default `text')",
     "FORMAT" },
   { "stoplist", 's', POPT_ARG_STRING, &option_stoplist, 0,
     "Specify a file from which to read a list of stopwords",
     "FILENAME" },

   // Options describing where docd is listening.
   { "host", 'h', POPT_ARG_STRING, &option_host, 0,
     "Specify the IP address of the host to connect to",
     "HOSTNAME" },
   { "port", 'p', POPT_ARG_INT, &option_port, 0,
     "Specify the number of the port to connect to",
     "PORT" },

   { NULL, 0, 0, NULL, 0, NULL, NULL }
};

4.1.10
int
main (int argc, const char **argv)
{
   try
   {
      if (!decode_options (argc, argv))
         return 1;

      if (option_file)
      {
         if (option_file[0] ≠ '/')
         {
            fprintf (stderr, "%s: a full path must be used for the file.\n",
                     progname);
            return 1;
         }

         FILE *f = stdin;
         if (!option_stdin)
         {
            f = fopen (option_file, "r");
            if (!f)
            {
               fprintf (stderr, "%s: error opening input file `%s'.\n",
                        progname, option_file);
               return 2;
            }
         }

         parse_textfile (option_file, f);
      }
      else
      {
         fprintf (stderr, "%s: a file must be specified with `-f'.\n",
                  progname);
         return 1;
      }
   }
   catch (Exception e)
   {
      fputs (e.explain().c_str(), stderr);
      return 1;
   }

   return 0;
}

4.1.11

Set progname and decode the command line options, using the popt library.

Returns `false' if there is an error with the options.

bool
decode_options (int argc, const char **argv)
{
   progname = argv[0];
   poptContext context = poptGetContext ("index.text", argc, argv,
                                         options_table, 0);

   int rc = poptGetNextOpt (context);
   if (rc < -1)
   {
      fprintf (stderr, "%s: error in command line options.\n", progname);
      return false;
   }

4.1.12

If the host IP address wasn't specified, set a default.

   if (option_host == 0)
      option_host = strdup (inet_ntoa (find_hostip()));

4.1.13
   if (option_dryrun)
      dest_file = stdout;
   else
   {
      int sock = socket (PF_INET, SOCK_STREAM, 0);
      if (sock == -1)
         throw ExceptionSystemError ("can't open docd's socket");

      sockaddr_in addr;
      addr.sin_family = AF_INET;
      addr.sin_port = htons (option_port);
      addr.sin_addr.s_addr = inet_addr (option_host);
      int rc = connect (sock, (sockaddr *) &addr, sizeof (addr));
      if (rc == -1)
         throw ExceptionSystemError ("can't connect to docd's socket");

      dest_file = fdopen (sock, "w");
      if (dest_file == NULL)
         throw ExceptionSystemError ("can't fdopen the socket to docd");
   }

   return true;
}

4.1.14

This was derived from the manpage indexer, which is why it uses the full state machine gubbins with only two states.

void
parse_textfile (const string &path, FILE *file)
{
   if (option_verbose)
      fprintf (stderr, "Parsing text file `%s'.\n", path.c_str());

   enum State
   {
      S_START, S_WORD
   };
   State s = S_START;
   string word;

   current_document = new DocumentSender (path, dest_file,
                                 option_format ? option_format : FORMAT_TEXT);

   int c;
   while ((c = fgetc (file)) ≠ EOF)
   {
      switch (s)
      {

4.1.15

We are in this state when we start and when we are not in the other state.

       case S_START:
         if (is_word_start (c))
            word = c, s = S_WORD;
         break;

4.1.16

While in this state a word is being read into word, until something which can't be part of it is encountered.

       case S_WORD:
         if (is_word_middle (c))
            word += c;
         else
         {
            current_document->add_word (word);
            s = S_START;
         }
         break;

4.1.17

This should never be executed.

       default:
         assert (false);
         break;
      }
   }

   delete current_document;
   current_document = 0;
}

4.1.18

Return `true' if a character can be the first character of a word.

// A word may begin with a letter (as classified by isalpha) or a hyphen.
bool
is_word_start (unsigned char c)
{
   if (c == '-')
      return true;

   return isalpha (c) != 0;
}

4.1.19

Return `true' if a character can be part of a word.

// After its first character a word may contain letters and digits (as
// classified by isalnum) and any of the punctuation characters - / . _ @
bool
is_word_middle (unsigned char c)
{
   switch (c)
   {
    case '-':
    case '/':
    case '.':
    case '_':
    case '@':
      return true;

    default:
      return isalnum (c) != 0;
   }
}