4.2. index.man.cc
A program to process man pages and generate indexing information from them.
TODO: do something sensible with man pages that consist only of a `.so' request (i.e., pages which just include another file).
#include "doc.hh"
#include <cstdio>
#include <cctype>
#include <cassert>
#include <popt.h>
#include <dirent.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <errno.h>
using namespace libdoc;
The default list of paths in which to look for man pages.
// Built-in man page directories, terminated by a null pointer so that
// callers can walk the array without knowing its length.
static const char *manpaths_default[] =
{
  "/usr/man",
  "/usr/local/man",
  NULL
};
// Format identifier handed to DocumentSender for every document created by
// parse_manpage.
const string FORMAT_MAN = "man";
The actual paths which will be searched will be stored in here by
`find_paths'.
// Filled in by find_paths; read by main (for --show-paths) and search_paths.
static vector <string> manpaths_actual;
The name of the program, taken from argv[0].
// Set by decode_options from argv[0]; used as the prefix of error messages.
const char *progname = 0;
Variables which are set by `decode_options' to describe the options
which were present. TODO: move option_stoplist to `docd'.
// Flags for the argument-less options.  popt writes to them through the
// addresses stored in options_table.
static int option_dryrun = 0;
static int option_showdefaultpaths = 0;
static int option_showfiles = 0;
static int option_showpaths = 0;
static int option_verbose = 0;

// Options which take a string argument; each stays null unless given on the
// command line.  option_stoplist is accepted but not read anywhere in this
// file.
static const char *option_file = 0;
static const char *option_host = 0;
static const char *option_manpath = 0;
static const char *option_stoplist = 0;

// Port used when connecting to the daemon.
static int option_port = DEFAULT_PORT;

// The document currently being built by parse_manpage, or null when no man
// page is being parsed.
static DocumentSender *current_document = 0;
The file to which indexing information is being written, which will usually
be the socket connected to `docd'.
// Set by decode_options (either stdout or a stream opened on the socket) and
// passed to each DocumentSender created by parse_manpage.
FILE *dest_file;
Prototypes for functions declared below.
// Parse the command line and set up dest_file; returns false on bad options.
bool decode_options (int argc, const char **argv);
// Fill manpaths_actual with the directories to search.
void find_paths ();
// Walk every directory in manpaths_actual and process the files found.
void search_paths ();
// Process one file, decompressing `.gz' and `.bz2' files on the fly.
void search_raw_file (const string &filename);
// Run the man page state machine over an already opened stream.
void parse_manpage (const string &path, FILE *file);
// Character classification used by parse_manpage when splitting words.
bool is_word_start (unsigned char c);
bool is_word_middle (unsigned char c);
The table of command line options.
// One entry per command line option, in the field order used by popt:
// long name, short name, argument type, address of the variable which
// receives the value, return value (always 0 here), help text, and the
// placeholder shown for the option's argument.  The all-null entry ends the
// table.
poptOption options_table[] =
{
  POPT_AUTOHELP

  { "dry-run", 'd', POPT_ARG_NONE, &option_dryrun, 0,
    "No information is sent to the daemon, it is printed to stdout instead",
    NULL },

  { "show-default-paths", 'D', POPT_ARG_NONE, &option_showdefaultpaths, 0,
    "Just print the inbuilt default manpaths",
    NULL },

  { "file", 'f', POPT_ARG_STRING, &option_file, 0,
    "Specify a single file to index",
    "FILENAME" },

  { "show-files", 'F', POPT_ARG_NONE, &option_showfiles, 0,
    "Just print the names of files which would be indexed",
    NULL },

  { "host", 'h', POPT_ARG_STRING, &option_host, 0,
    "Specify the IP address of the host to connect to",
    "HOSTNAME" },

  { "man-path", 'm', POPT_ARG_STRING, &option_manpath, 0,
    "Specify a path under which to search for man pages",
    "PATH" },

  { "port", 'p', POPT_ARG_INT, &option_port, 0,
    "Specify the number of the port to connect to",
    "PORT" },

  { "show-paths", 'P', POPT_ARG_NONE, &option_showpaths, 0,
    "Just print the paths which will be searched for man pages",
    NULL },

  { "stoplist", 's', POPT_ARG_STRING, &option_stoplist, 0,
    "Specify a file from which to read a list of stopwords",
    "FILENAME" },

  { "verbose", 'V', POPT_ARG_NONE, &option_verbose, 0,
    "Print more information about what the program is doing",
    NULL },

  { NULL, 0, 0, NULL, 0, NULL, NULL }
};
// Program entry point.
//
// Decodes the options, then does exactly one of the following:
//   - prints the inbuilt default man paths (--show-default-paths),
//   - processes the single file named with --file,
//   - prints the paths which would be searched (--show-paths), or
//   - searches those paths and processes every man page found.
//
// Returns 0 on success and 1 if the options were invalid or an
// ExceptionSystemError was thrown.
int
main (int argc, const char **argv)
{
  try
    {
      if (!decode_options (argc, argv))
        return 1;

      if (option_showdefaultpaths)
        {
          for (const char **p = manpaths_default; *p; ++p)
            puts (*p);
          return 0;
        }

      if (option_file)
        search_raw_file (option_file);
      else
        {
          find_paths();
          if (option_showpaths)
            {
              for (size_t i = 0; i < manpaths_actual.size(); ++i)
                puts (manpaths_actual[i].c_str());
              return 0;
            }
          search_paths();
        }
    }
  // Catch by reference: catching by value copies the exception object and
  // would slice anything derived from ExceptionSystemError.
  catch (ExceptionSystemError &e)
    {
      fputs (e.explain().c_str(), stderr);
      return 1;
    }
  return 0;
}
Set progname and decode the command line options, using the popt
library.
Returns `false' if there is an error with the options.
bool
decode_options (int argc, const char **argv)
{
progname = argv[0];
poptContext context = poptGetContext ("index.man", argc, argv,
options_table, 0);
int rc = poptGetNextOpt (context);
if (rc < -1)
{
fprintf (stderr, "%s: error in command line options.\n", progname);
return false;
}
If the host IP address wasn't specified, set a default.
if (option_host == 0)
option_host = strdup (inet_ntoa (find_hostip()));
if (option_dryrun || option_showfiles)
dest_file = stdout;
else
{
int sock = socket (PF_INET, SOCK_STREAM, 0);
if (sock == -1)
throw ExceptionSystemError ("can't open docd's socket");
sockaddr_in addr;
addr.sin_family = AF_INET;
addr.sin_port = htons (option_port);
addr.sin_addr.s_addr = inet_addr (option_host);
int rc = connect (sock, (sockaddr *) &addr, sizeof (addr));
if (rc == -1)
throw ExceptionSystemError ("can't connect to docd's socket");
dest_file = fdopen (sock, "w");
if (dest_file == NULL)
throw ExceptionSystemError ("can't fdopen the socket to docd");
}
return true;
}
If possible, work out which paths should be searched for man pages. If the
information cannot be obtained by running the `manpath' program (e.g.,
if it isn't installed) then the inbuilt list of default paths is used
instead.
The results are stored in manpaths_actual.
void
find_paths ()
{
if (option_manpath)
manpaths_actual.push_back (option_manpath);
else
{
FILE *manpath = popen ("manpath", "r");
if (manpath ≠ NULL)
{
Read the output from `manpath'.
string paths;
int c;
while ((c = fgetc (manpath)) ≠ EOF)
paths += char (c);
pclose (manpath);
Extract the list of paths, which are seperated by colons.
chomp (paths);
split (manpaths_actual, paths, ':');
}
else
{
for (const char **p = manpaths_default; *p; ++p)
manpaths_actual.push_back (*p);
}
}
}
This is a rather hairy function.
It might be better to write something like `popen' which doesn't use the
shell.
void
search_paths ()
{
for (size_t i = 0; i < manpaths_actual.size(); ++i)
{
DIR *topdir = opendir (manpaths_actual[i].c_str());
if (topdir == NULL)
fprintf (stderr, "%s: can't read directory `%s':\n%s", progname,
manpaths_actual[i].c_str(), strerror (errno));
else
{
dirent *subdir = readdir (topdir);
while (subdir)
{
if (strncmp (subdir->d_name, "man", 3) == 0 && subdir->d_name[3])
{
string mandirname = manpaths_actual[i].c_str();
mandirname += '/';
mandirname += subdir->d_name;
DIR *mandir = opendir (mandirname.c_str());
if (mandir == NULL)
fprintf (stderr, "%s: can't read directory `%s':\n%s",
progname, mandirname.c_str(), strerror (errno));
else
{
dirent *manfile = readdir (mandir);
while (manfile)
{
if (manfile->d_name[0] ≠ '.')
{
string filename = mandirname;
filename += '/';
filename += manfile->d_name;
search_raw_file (filename);
}
manfile = readdir (mandir);
}
closedir (mandir);
}
}
subdir = readdir (topdir);
}
closedir (topdir);
}
}
}
void
search_raw_file (const string &filename)
{
if (option_showfiles)
puts (filename.c_str());
else
{
bool compr = false;
FILE *manstream;
string cmd;
if (check_extension (filename, ".gz"))
cmd = "zcat ";
else if (check_extension (filename, ".bz2"))
cmd = "bzcat ";
if (cmd.empty())
manstream = fopen (filename.c_str(), "r");
else
{
compr = true;
cmd += filename;
manstream = popen (cmd.c_str(), "r");
}
if (manstream == NULL)
fprintf (stderr, "%s: can't read file `%s'.\n", progname,
filename.c_str());
else
{
parse_manpage (filename, manstream);
if (compr)
pclose (manstream);
else
fclose (manstream);
}
}
}
void
parse_manpage (const string &path, FILE *file)
{
if (option_verbose)
fprintf (stderr, "Parsing man page `%s'.\n", path.c_str());
enum State
{
S_STARTLINE, S_DOTCMD, S_MAYBE_TH, S_MAYBE_SH, S_BEFORE_SEC_TITLE,
S_SEC_TITLE, S_BEFORE_STRING, S_STRING, S_STRING_QUOTED,
S_IGNORE_TILL_SPACE, S_SPACE_OR_IGNORE, S_COMMENT, S_WORD,
S_SKIP_BACKSLASH, S_NAME_LINE_START, S_NAME_LINE
};
State s = S_STARTLINE;
string word;
vector <string> fields;
current_document = new DocumentSender (path, dest_file, FORMAT_MAN);
int c;
while ((c = fgetc (file)) ≠ EOF)
{
switch (s)
{
We are in this state only on the first character of a line.
case S_STARTLINE:
if (c == '\n')
;
if (c == '.')
s = S_DOTCMD;
else if (isspace (c))
s = S_SPACE_OR_IGNORE;
else if (is_word_start (c))
word = c, s = S_WORD;
break;
On the second character of a line if the first was a `.'.
case S_DOTCMD:
if (c == '\\' || c == '"')
s = S_COMMENT;
else if (c == 'T')
s = S_MAYBE_TH;
else if (c == 'S')
s = S_MAYBE_SH;
else if (c == '\n')
s = S_STARTLINE;
else if (isspace (c))
s = S_SPACE_OR_IGNORE;
else
s = S_IGNORE_TILL_SPACE;
break;
After reading `.T' at the start of the line, so we check to see
whether it is the `.TH' command.
case S_MAYBE_TH:
if (c == 'H')
fields.clear(), s = S_BEFORE_STRING;
else if (isspace (c))
s = S_SPACE_OR_IGNORE;
else
s = S_IGNORE_TILL_SPACE;
break;
case S_MAYBE_SH:
if (c == 'H')
s = S_BEFORE_SEC_TITLE;
else if (isspace (c))
s = S_SPACE_OR_IGNORE;
else
s = S_IGNORE_TILL_SPACE;
break;
case S_BEFORE_SEC_TITLE:
if (c == '\n')
s = S_STARTLINE;
else if (!isspace (c))
word = c, s = S_SEC_TITLE;
break;
case S_SEC_TITLE:
if (c == '\n')
{
if (word == "NAME" || word == "\"NAME\"")
s = S_NAME_LINE_START;
else
s = S_STARTLINE;
}
else
word += c;
break;
Skip the whitespace before a string.
case S_BEFORE_STRING:
if (c == '\n')
{
string title, date, manual;
if (fields.size() ≥ 1)
{
for (size_t i = 0; i < fields[0].size(); ++i)
if (fields[0][i] ≠ '\\')
title += fields[0][i];
if (fields.size() ≥ 2)
{
title += '(';
title += fields[1];
title += ')';
if (fields.size() ≥ 3)
date = fields[2];
if (fields.size() ≥ 5)
manual = fields[4];
}
}
current_document->set_header_info (title, date, manual);
s = S_STARTLINE;
}
else if (c == '"')
word = "", s = S_STRING_QUOTED;
else if (!isspace (c))
word = c, s = S_STRING;
break;
Read a string which has no quotes. It is terminated by whitespace.
case S_STRING:
if (c == '\n')
ungetc (c, file);
if (isspace (c))
fields.push_back (word), s = S_BEFORE_STRING;
else
word += c;
break;
Read a string which is enclosed in double quotes.
case S_STRING_QUOTED:
if (c == '"')
{
fields.push_back (word), s = S_BEFORE_STRING;
}
else
word += c;
break;
Ignore everything until we find a whitespace character.
case S_IGNORE_TILL_SPACE:
if (c == '\n')
s = S_STARTLINE;
else if (isspace (c))
s = S_SPACE_OR_IGNORE;
break;
The last character was whitespace or something which we are
ignoring, so this might be the start of a word.
case S_SPACE_OR_IGNORE:
if (c == '\n')
s = S_STARTLINE;
else if (isspace (c))
;
else if (c == '\\')
word = "", s = S_SKIP_BACKSLASH;
else if (is_word_start (c))
word = c, s = S_WORD;
break;
In a line comment, terminated by a newline.
case S_COMMENT:
if (c == '\n')
s = S_STARTLINE;
break;
While in this state a word is being read into word, until
something which can't be part of it is encountered.
case S_WORD:
if (is_word_middle (c))
word += c;
else
{
current_document->add_word (word);
if (c == '\n')
s = S_STARTLINE;
else if (c == '\\')
word = "", s = S_SKIP_BACKSLASH;
else
s = S_SPACE_OR_IGNORE;
}
break;
Skip an escape sequence which started with a backslash. We expect
to find somehting like `\fI' or `\fR'. We store the characters
after the backslash in `word' to keep track. We also
recognize `\-' as the start of a word.
case S_SKIP_BACKSLASH:
if (word.size() == 2)
{
if (is_word_start (c))
word = c, s = S_WORD;
else if (c == '\n')
s = S_STARTLINE;
else if (c == '\\')
word = "";
else
s = S_SPACE_OR_IGNORE;
}
else if (word.empty() and c == 'f')
word = 'f';
else if (word.empty() and c == '-')
word = '-', s = S_WORD;
else if (not word.empty())
word += c;
else
s = S_SPACE_OR_IGNORE;
break;
case S_NAME_LINE_START:
if (c == '.')
s = S_IGNORE_TILL_SPACE;
else
word = c, s = S_NAME_LINE;
break;
case S_NAME_LINE:
if (c == '\n')
{
string title;
for (size_t i = 0; i < word.size(); ++i)
if (word[i] ≠ '\\')
title += word[i];
current_document->set_title (title);
s = S_STARTLINE;
}
else
word += c;
break;
This should never be executed.
default:
assert (false);
break;
}
}
delete current_document;
current_document = 0;
}
Return `true' if a character can be the first character of a word.
// A word may begin with a letter or with a hyphen.
bool
is_word_start (unsigned char c)
{
  if (c == '-')
    return true;
  return isalpha (c) != 0;
}
Return `true' if a character can be part of a word.
// After its first character a word may contain letters, digits and any of
// the punctuation characters `-', `/', `.', `_' and `@'.
bool
is_word_middle (unsigned char c)
{
  switch (c)
    {
    case '-':
    case '/':
    case '.':
    case '_':
    case '@':
      return true;
    default:
      return isalnum (c) != 0;
    }
}