4.2. index.man.cc


4.2.1

A program to process man pages and generate indexing information from them. TODO: do something sensible with the .so ones.


4.2.2
#include "doc.hh"
#include <cassert>
#include <cctype>
#include <cstdio>
#include <cstring>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <dirent.h>
#include <errno.h>
#include <popt.h>
#include <unistd.h>
using namespace libdoc;

4.2.3

The default list of paths in which to look for man pages.

// Built-in fallback search paths; the list is terminated by a null pointer.
static const char *manpaths_default[] = { "/usr/man", "/usr/local/man", NULL };

4.2.4
// Format name passed to DocumentSender for every document created by
// parse_manpage.
const string FORMAT_MAN = "man";

4.2.5

The actual paths which will be searched will be stored in here by `find_paths'.

// Filled in by find_paths(); read by main() (for --show-paths) and by
// search_paths().
static vector <string> manpaths_actual;

4.2.6

The name of the program, taken from argv[0].

// Set by decode_options(); used as the prefix of diagnostics on stderr.
const char *progname = 0;

4.2.7

Variables which are set by `decode_options' to describe the options which were present. TODO: move option_stoplist to `docd'.

// Switches without an argument: popt stores a non-zero value when the
// switch is present on the command line.
static int option_dryrun = 0;             // -d, --dry-run
static int option_showdefaultpaths = 0;   // -D, --show-default-paths
static int option_showfiles = 0;          // -F, --show-files
static int option_showpaths = 0;          // -P, --show-paths
static int option_verbose = 0;            // -V, --verbose

// Options which take an argument; the pointers stay null when the option
// is absent.
static const char *option_file = 0;       // -f, --file
static const char *option_host = 0;       // -h, --host
static const char *option_manpath = 0;    // -m, --man-path
static const char *option_stoplist = 0;   // -s, --stoplist
static int option_port = DEFAULT_PORT;    // -p, --port

4.2.8
// The document currently being parsed.  parse_manpage() allocates it at
// the start of a page and deletes it (resetting this to null) at the end.
static DocumentSender *current_document = 0;

4.2.9

The file to which indexing information is being written, which will usually be the socket connected to `docd'.

// Set by decode_options(): stdout for --dry-run and --show-files,
// otherwise a stream opened on the socket connected to the daemon.
FILE *dest_file;

4.2.10

Prototypes for functions declared below.

// Parse the command line and set up dest_file; false on bad options.
bool decode_options (int argc, const char **argv);
// Fill manpaths_actual with the directories to search.
void find_paths ();
// Walk every `manN' subdirectory of manpaths_actual and index its files.
void search_paths ();
// Open one (possibly compressed) man page and hand it to parse_manpage.
void search_raw_file (const string &filename);
// Scan an open man page and send its words and header info to dest_file.
void parse_manpage (const string &path, FILE *file);
// Character classification used by parse_manpage when collecting words.
bool is_word_start (unsigned char c);
bool is_word_middle (unsigned char c);

4.2.11

The table of command line options.

// Each entry is { long name, short name, argument type, destination
// variable, val, help text, argument placeholder }.  Every val is 0, so
// popt stores the values itself and poptGetNextOpt() does not return for
// individual options.
poptOption options_table[] =
{
   POPT_AUTOHELP

   { "dry-run", 'd', POPT_ARG_NONE, &option_dryrun, 0,
     "No information is sent to the daemon, it is printed to stdout instead",
     NULL },

   { "show-default-paths", 'D', POPT_ARG_NONE, &option_showdefaultpaths, 0,
     "Just print the inbuilt default manpaths",
     NULL },

   { "file", 'f', POPT_ARG_STRING, &option_file, 0,
     "Specify a single file to index",
     "FILENAME" },

   { "show-files", 'F', POPT_ARG_NONE, &option_showfiles, 0,
     "Just print the names of files which would be indexed",
     NULL },

   { "host", 'h', POPT_ARG_STRING, &option_host, 0,
     "Specify the IP address of the host to connect to",
     "HOSTNAME" },

   { "man-path", 'm', POPT_ARG_STRING, &option_manpath, 0,
     "Specify a path under which to search for man pages",
     "PATH" },

   { "port", 'p', POPT_ARG_INT, &option_port, 0,
     "Specify the number of the port to connect to",
     "PORT" },

   { "show-paths", 'P', POPT_ARG_NONE, &option_showpaths, 0,
     "Just print the paths which will be searched for man pages",
     NULL },

   { "stoplist", 's', POPT_ARG_STRING, &option_stoplist, 0,
     "Specify a file from which to read a list of stopwords",
     "FILENAME" },

   { "verbose", 'V', POPT_ARG_NONE, &option_verbose, 0,
     "Print more information about what the program is doing",
     NULL },

   { NULL, 0, 0, NULL, 0, NULL, NULL }
};

4.2.12
// Program entry point.
//
// Decodes the options, then does exactly one of:
//   - print the built-in default paths (--show-default-paths),
//   - index the single file given with --file,
//   - print the paths that would be searched (--show-paths),
//   - search those paths and index every man page found.
//
// Returns 0 on success, and 1 if the options are bad or an
// ExceptionSystemError is raised.
int
main (int argc, const char **argv)
{
   try
   {
      if (!decode_options (argc, argv))
         return 1;

      if (option_showdefaultpaths)
      {
         for (const char **p = manpaths_default; *p; ++p)
            puts (*p);
         return 0;
      }

      if (option_file)
         search_raw_file (option_file);
      else
      {
         find_paths();

         if (option_showpaths)
         {
            for (size_t i = 0; i < manpaths_actual.size(); ++i)
               puts (manpaths_actual[i].c_str());
            return 0;
         }

         search_paths();
      }
   }
   // Catch by reference: catching by value copies the exception and would
   // slice any class derived from ExceptionSystemError.
   catch (ExceptionSystemError &e)
   {
      fputs (e.explain().c_str(), stderr);
      return 1;
   }

   return 0;
}

4.2.13

Set progname and decode the command line options, using the popt library.

Returns `false' if there is an error with the options.

bool
decode_options (int argc, const char **argv)
{
   progname = argv[0];
   poptContext context = poptGetContext ("index.man", argc, argv,
                                         options_table, 0);

   int rc = poptGetNextOpt (context);
   if (rc < -1)
   {
      fprintf (stderr, "%s: error in command line options.\n", progname);
      return false;
   }

4.2.14

If the host IP address wasn't specified, set a default.

   if (option_host == 0)
      option_host = strdup (inet_ntoa (find_hostip()));

4.2.15
   if (option_dryrun || option_showfiles)
      dest_file = stdout;
   else
   {
      int sock = socket (PF_INET, SOCK_STREAM, 0);
      if (sock == -1)
         throw ExceptionSystemError ("can't open docd's socket");

      sockaddr_in addr;
      addr.sin_family = AF_INET;
      addr.sin_port = htons (option_port);
      addr.sin_addr.s_addr = inet_addr (option_host);
      int rc = connect (sock, (sockaddr *) &addr, sizeof (addr));
      if (rc == -1)
         throw ExceptionSystemError ("can't connect to docd's socket");

      dest_file = fdopen (sock, "w");
      if (dest_file == NULL)
         throw ExceptionSystemError ("can't fdopen the socket to docd");
   }

   return true;
}

4.2.16

If possible, work out which paths should be searched for man pages. If the information cannot be obtained by running the `manpath' program (e.g., if it isn't installed) then the inbuilt list of default paths is used instead.

The results are stored in manpaths_actual.

// Fill manpaths_actual with the directories to search.
//
// Order of preference: the --man-path option, then the output of the
// `manpath' program, then the inbuilt default list.
void
find_paths ()
{
   if (option_manpath)
   {
      manpaths_actual.push_back (option_manpath);
      return;
   }

   // Becomes true once `manpath' has given us a usable list.
   bool have_paths = false;
   FILE *manpath = popen ("manpath", "r");

   if (manpath != NULL)
   {

4.2.17

Read the output from `manpath'.

      string paths;
      int c;
      while ((c = fgetc (manpath)) != EOF)
         paths += char (c);

      // popen() only fails if the shell itself cannot be started.  A
      // missing or failing `manpath' shows up as a non-zero exit status
      // and/or empty output, so both are checked below.
      int status = pclose (manpath);

4.2.18

Extract the list of paths, which are separated by colons.

      chomp (paths);
      if (status == 0 && !paths.empty())
      {
         split (manpaths_actual, paths, ':');
         have_paths = true;
      }
   }

   if (!have_paths)
   {
      for (const char **p = manpaths_default; *p; ++p)
         manpaths_actual.push_back (*p);
   }
}

4.2.19

This is a rather hairy function. It might be better to write something like `popen' which doesn't use the shell.

void
search_paths ()
{
   for (size_t i = 0; i < manpaths_actual.size(); ++i)
   {
      DIR *topdir = opendir (manpaths_actual[i].c_str());

      if (topdir == NULL)
         fprintf (stderr, "%s: can't read directory `%s':\n%s", progname,
                  manpaths_actual[i].c_str(), strerror (errno));
      else
      {
         dirent *subdir = readdir (topdir);
         while (subdir)
         {
            if (strncmp (subdir->d_name, "man", 3) == 0 && subdir->d_name[3])
            {
               string mandirname = manpaths_actual[i].c_str();
               mandirname += '/';
               mandirname += subdir->d_name;
               DIR *mandir = opendir (mandirname.c_str());

               if (mandir == NULL)
                  fprintf (stderr, "%s: can't read directory `%s':\n%s",
                           progname, mandirname.c_str(), strerror (errno));
               else
               {
                  dirent *manfile = readdir (mandir);
                  while (manfile)
                  {
                     if (manfile->d_name[0] ≠ '.')
                     {
                        string filename = mandirname;
                        filename += '/';
                        filename += manfile->d_name;

                        search_raw_file (filename);
                     }

                     manfile = readdir (mandir);
                  }

                  closedir (mandir);
               }
            }

            subdir = readdir (topdir);
         }

         closedir (topdir);
      }
   }
}

4.2.20
void
search_raw_file (const string &filename)
{
   if (option_showfiles)
      puts (filename.c_str());
   else
   {
      // TODO: use the libraries instead. It'll be faster.
      bool compr = false;
      FILE *manstream;
      string cmd;

      if (check_extension (filename, ".gz"))
         cmd = "zcat ";
      else if (check_extension (filename, ".bz2"))
         cmd = "bzcat ";
      if (cmd.empty())
         manstream = fopen (filename.c_str(), "r");
      else
      {
         compr = true;
         cmd += filename;
         manstream = popen (cmd.c_str(), "r");
      }

      if (manstream == NULL)
         fprintf (stderr, "%s: can't read file `%s'.\n", progname,
                  filename.c_str());
      else
      {
         parse_manpage (filename, manstream);

         if (compr)
            pclose (manstream);
         else
            fclose (manstream);
      }
   }
}

4.2.21
void
parse_manpage (const string &path, FILE *file)
{
   if (option_verbose)
      fprintf (stderr, "Parsing man page `%s'.\n", path.c_str());

   enum State
   {
      S_STARTLINE, S_DOTCMD, S_MAYBE_TH, S_MAYBE_SH, S_BEFORE_SEC_TITLE,
      S_SEC_TITLE, S_BEFORE_STRING, S_STRING, S_STRING_QUOTED,
      S_IGNORE_TILL_SPACE, S_SPACE_OR_IGNORE, S_COMMENT, S_WORD,
      S_SKIP_BACKSLASH, S_NAME_LINE_START, S_NAME_LINE
   };
   State s = S_STARTLINE;
   string word;
   vector <string> fields;

   current_document = new DocumentSender (path, dest_file, FORMAT_MAN);

   int c;
   while ((c = fgetc (file)) ≠ EOF)
   {
      switch (s)
      {

4.2.22

We are in this state only on the first character of a line.

       case S_STARTLINE:
         if (c == '\n')
            /* do nothing */;
         if (c == '.')
            s = S_DOTCMD;
         else if (isspace (c))
            s = S_SPACE_OR_IGNORE;
         else if (is_word_start (c))
            word = c, s = S_WORD;
         break;

4.2.23

On the second character of a line if the first was a `.'.

       case S_DOTCMD:
         if (c == '\\' || c == '"')
            s = S_COMMENT;
         else if (c == 'T')
            s = S_MAYBE_TH;
         else if (c == 'S')
            s = S_MAYBE_SH;
         else if (c == '\n')
            s = S_STARTLINE;
         else if (isspace (c))
            s = S_SPACE_OR_IGNORE;
         else
            s = S_IGNORE_TILL_SPACE;
         break;

4.2.24

After reading `.T' at the start of the line, so we check to see whether it is the `.TH' command.

       case S_MAYBE_TH:
         if (c == 'H')
            fields.clear(), s = S_BEFORE_STRING;
         else if (isspace (c))
            s = S_SPACE_OR_IGNORE;
         else
            s = S_IGNORE_TILL_SPACE;
         break;

4.2.25
       case S_MAYBE_SH:
         if (c == 'H')
            s = S_BEFORE_SEC_TITLE;
         else if (isspace (c))
            s = S_SPACE_OR_IGNORE;
         else
            s = S_IGNORE_TILL_SPACE;
         break;

       case S_BEFORE_SEC_TITLE:
         if (c == '\n')
            s = S_STARTLINE;
         else if (!isspace (c))
            word = c, s = S_SEC_TITLE;
         break;

       case S_SEC_TITLE:
         if (c == '\n')
         {
            if (word == "NAME" || word == "\"NAME\"")
               s = S_NAME_LINE_START;
            else
               s = S_STARTLINE;
         }
         else
            word += c;
         break;

4.2.26

Skip the whitespace before a string.

       case S_BEFORE_STRING:
         if (c == '\n')
         {
            string title, date, manual;
            if (fields.size() ≥ 1)
            {
               for (size_t i = 0; i < fields[0].size(); ++i)
                  if (fields[0][i] ≠ '\\')
                     title += fields[0][i];

               if (fields.size() ≥ 2)
               {
                  title += '(';
                  title += fields[1];
                  title += ')';

                  if (fields.size() ≥ 3)
                     date = fields[2];
                  if (fields.size() ≥ 5)
                     manual = fields[4];
               }
            }
            current_document->set_header_info (title, date, manual);

            s = S_STARTLINE;
         }
         else if (c == '"')
            word = "", s = S_STRING_QUOTED;
         else if (!isspace (c))
            word = c, s = S_STRING;
         break;

4.2.27

Read a string which has no quotes. It is terminated by whitespace.

       case S_STRING:
         if (c == '\n')
            ungetc (c, file);
         if (isspace (c))
            fields.push_back (word), s = S_BEFORE_STRING;
         else
            word += c;
         break;

4.2.28

Read a string which is enclosed in double quotes.

       case S_STRING_QUOTED:
         if (c == '"')
         {
            fields.push_back (word), s = S_BEFORE_STRING;
         }
         else
            word += c;
         break;

4.2.29

Ignore everything until we find a whitespace character.

       case S_IGNORE_TILL_SPACE:
         if (c == '\n')
            s = S_STARTLINE;
         else if (isspace (c))
            s = S_SPACE_OR_IGNORE;
         break;

4.2.30

The last character was whitespace or something which we are ignoring, so this might be the start of a word.

       case S_SPACE_OR_IGNORE:
         if (c == '\n')
            s = S_STARTLINE;
         else if (isspace (c))
            /* do nothing */;
         else if (c == '\\')
            word = "", s = S_SKIP_BACKSLASH;
         else if (is_word_start (c))
            word = c, s = S_WORD;
         break;

4.2.31

In a line comment, terminated by a newline.

       case S_COMMENT:
         if (c == '\n')
            s = S_STARTLINE;
         break;

4.2.32

While in this state a word is being read into word, until something which can't be part of it is encountered.

       case S_WORD:
         if (is_word_middle (c))
            word += c;
         else
         {
            current_document->add_word (word);
            if (c == '\n')
               s = S_STARTLINE;
            else if (c == '\\')
               word = "", s = S_SKIP_BACKSLASH;
            else
               s = S_SPACE_OR_IGNORE;
         }
         break;

4.2.33

Skip an escape sequence which started with a backslash. We expect to find somehting like `\fI' or `\fR'. We store the characters after the backslash in `word' to keep track. We also recognize `\-' as the start of a word.

       case S_SKIP_BACKSLASH:
         if (word.size() == 2)
         {
            if (is_word_start (c))
               word = c, s = S_WORD;
            else if (c == '\n')
               s = S_STARTLINE;
            else if (c == '\\')
               word = "";
            else
               s = S_SPACE_OR_IGNORE;
         }
         else if (word.empty() and c == 'f')
            word = 'f';
         else if (word.empty() and c == '-')
            word = '-', s = S_WORD;
         else if (not word.empty())
            word += c;
         else
            s = S_SPACE_OR_IGNORE;
         break;

       case S_NAME_LINE_START:
         if (c == '.')
            s = S_IGNORE_TILL_SPACE;
         else
            word = c, s = S_NAME_LINE;
         break;

       case S_NAME_LINE:
         if (c == '\n')
         {
            // TODO: add a section here?
            string title;
            for (size_t i = 0; i < word.size(); ++i)
               if (word[i] ≠ '\\')
                  title += word[i];

            current_document->set_title (title);
            s = S_STARTLINE;
         }
         else
            word += c;
         break;

4.2.34

This should never be executed.

       default:
         assert (false);
         break;
      }
   }

   delete current_document;
   current_document = 0;
}

4.2.35

Return `true' if a character can be the first character of a word.

// A word may begin with a letter or with a hyphen.
bool
is_word_start (unsigned char c)
{
   if (c == '-')
      return true;

   return isalpha (c) != 0;
}

4.2.36

Return `true' if a character can be part of a word.

// Inside a word we accept letters, digits and a small set of punctuation
// characters: `-', `/', `.', `_' and `@'.
bool
is_word_middle (unsigned char c)
{
   switch (c)
   {
    case '-':
    case '/':
    case '.':
    case '_':
    case '@':
      return true;

    default:
      return isalnum (c) != 0;
   }
}