5. Searching

5.1. daemon.cc


5.1.1

A daemon to manage the documentation index database. TODO: add a mutex for access to the DB.


5.1.2
#include "doc.hh"
#include "table.hh"
#include <string>
#include <algorithm>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <cctype>
#include <pthread.h>
#include <popt.h>
#include <signal.h>
#include <errno.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <cassert>
using namespace libdoc;

// Database directory used when no --db-path option is given
// (see decode_options).
const string default_dbpath = "/tmp/docd";
// Log file used when no --log-file option is given (see decode_options).
const string default_logfile = "/tmp/docd/docd.log";
// Initial length of a word's section-ID array, and the number of slots
// added each time that array fills up (see add_word).
const size_t WORD_SECLIST_INC = 16;

5.1.3

The name of the program, taken from argv[0], followed by the variables which hold the decoded command line options.

// argv[0]; assigned at the start of decode_options and used as the prefix
// of error messages.
const char *progname;
// Raw --db-path argument (0 if absent) and the path actually used, which
// falls back to default_dbpath.
const char *option_dbpath = 0;
string option_dbpath_final;
// Raw --log-file argument (0 if absent) and the path actually used, which
// falls back to default_logfile.
const char *option_logfile = 0;
string option_logfile_final;
// Flags filled in by popt from the corresponding entries of options_table.
int option_verbose = 0;
int option_ping = 0;
int option_nodetach = 0;
int option_kill = 0;
// NOTE(review): option_status is set by popt but is not read anywhere in
// the code visible here.
int option_status = 0;
int option_port = DEFAULT_PORT;
// Host address as a string; decode_options supplies a default when it is 0.
char *option_host = 0;
// Set from handle_signal and polled by the accept loop in
// wait_for_connections.
volatile sig_atomic_t finish_processing = false;
// True when the program was asked to ping or kill a running daemon; main
// then skips prepare_db.
bool options_client = false;

5.1.4

A file to send logging messages to.

// Opened in main (after the client-only commands have been handled) and
// written to by the connection-handling code.
FILE *logfile = 0;
// Starts out as stderr; daemonize points it at logfile once the process
// has detached from the terminal.
FILE *errorfile = stderr;

5.1.5

A table of the tables which will be used to store the indexing information.

// Describes one table of the index database.
struct TableSpec
{
   // Base name of the table file; prepare_db prepends the database
   // directory and appends ".table".
   const char *filename;
   // Field description passed to Table::create.  Presumably one type letter
   // per field ('s', 'i' and 'I' are the type codes used with TableField in
   // this file) -- confirm against table.hh.
   const char *fields;
   // Kind of primary key, passed to Table::create.
   PrimKeyType primkey;
   // Index description passed to Table::create (format defined by the Table
   // class; not visible here).
   const char *indices;
   // Where prepare_db stores the pointer to the opened table.
   Table **var;
};

// The opened tables; assigned in prepare_db and deleted in close_db.
Table *table_format, *table_document, *table_section, *table_word;

// One entry per table, terminated by an entry whose `filename' and `var'
// are both null (loops in this file test one or the other).
const TableSpec table_specs[] =
{
   { "format", "sss", PRIMKEY_AUTO, "1:format_name", &table_format },
   { "document", "sis", PRIMKEY_AUTO, "1:document_filename", &table_document },
   { "section", "iss", PRIMKEY_AUTO, "3:section_reference", &table_section },
   { "word", "I", PRIMKEY_STRING, "", &table_word },
   { 0, 0, PRIMKEY_AUTO, 0, 0 }
};

5.1.6

A mutex which is used to lock the whole database (i.e., it is a very crude and inefficient locking mechanism).

// Initialised and destroyed in main; held around table accesses by the
// add_*, close_db and search_for_words functions.
pthread_mutex_t db_mutex;

5.1.7

The table of command line options.

// Command line options understood by docd.  Each entry stores its value
// directly into one of the option_* globals; none of them uses a return
// value, so a single call to poptGetNextOpt processes the whole command
// line.
poptOption options_table[] =
{
   POPT_AUTOHELP
   // The help text here used to be a copy of the --log-file description.
   { "db-path", 'd', POPT_ARG_STRING, &option_dbpath, 0,
      "Specify the directory holding the database tables", NULL },
   { "log-file", 'l', POPT_ARG_STRING, &option_logfile, 0,
      "Specify the file to write log messages to", NULL },
   { "verbose-log", 'V', POPT_ARG_NONE, &option_verbose, 0,
      "Write LOTS of information into the log file", NULL },
   { "kill-daemon", 'K', POPT_ARG_NONE, &option_kill, 0,
      "Sends a message to the daemon asking it to shut down", NULL },
   { "no-detach", 'D', POPT_ARG_NONE, &option_nodetach, 0,
      "Don't fork to detach from the terminal", NULL },
   { "ping", 'P', POPT_ARG_NONE, &option_ping, 0,
      "Send a ping request to the running daemon", NULL },
   { "status", 'S', POPT_ARG_NONE, &option_status, 0,
      "Print status information from the running daemon", NULL },
   { "port", 'p', POPT_ARG_INT, &option_port, 0,
      "Specify the number of the port to connect to", NULL },
   { "host", 'h', POPT_ARG_STRING, &option_host, 0,
      "Specify the IP address of the host to connect to", NULL },
   { NULL, 0, 0, NULL, 0, NULL, NULL }
};

5.1.8

This is used to keep track of thread-specific information.

// Allocated by wait_for_connections for every accepted connection and
// handed to process_requests, which frees both the structure and the
// pthread_t it points to.
struct ThreadStuff
{
   // Heap-allocated thread handle filled in by pthread_create.
   pthread_t *thread;
   // Connected socket returned by accept.
   int socket;
};

5.1.9

Prototypes for the functions defined below.

// Start-up.
bool decode_options (int argc, const char **argv);
void prepare_db ();
void populate_format_table ();
// Adding records to the database; each returns the ID of the new record.
Key add_format (string &name, string &viewers);
Key add_document (string &path, Key id_format, string &title);
Key add_document (string &path, string &format, string &title);
Key add_section (Key id_document, string &title, string &ref);
void add_word (Key id_section, string &word);
void close_db ();
// Daemon and connection handling.
void daemonize ();
void wait_for_connections ();
void *process_requests (void *sp);
bool process_command (int s, char c, Key &current_document,
                      Key &current_section);
// Searching.
void search_for_words (vector <SearchResult> &results,
                       const vector <string> &words);
void search_result_append (vector <SearchResult> &results, Key secid);
void sock_send_search_results (int s, const vector <SearchResult> &results);
// Signals and fatal errors.
void handle_signal (int sig);
void syscall_error (const char *msg, int exitcode);   // XXX: noreturn

5.1.10
// Program entry point.
//
// Decodes the options, then either acts as a client (--ping and
// --kill-daemon talk to a running daemon and return) or opens the database
// and the log file, optionally detaches from the terminal, and serves
// connections until told to stop.
//
// Exit status: 0 on success, 1 for bad command line options, 6 if the log
// file cannot be opened, 9 if a client command raised an Exception.
int
main (int argc, const char **argv)
{
   if (!decode_options (argc, argv))
      return 1;

   pthread_mutex_init (&db_mutex, NULL);

   if (!options_client)
      prepare_db();

   try
   {
      if (option_ping)
      {
         int s = open_connection_to_docd (option_host, option_port);

         sock_send_str (s, "P\n");
         string reply;
         sock_read_string (s, reply);
         printf ("%s\n", reply.c_str());

         close (s);
         return 0;
      }
      else if (option_kill)
      {
         int s = open_connection_to_docd (option_host, option_port);
         sock_send_str (s, "K\n");

         // Release the socket explicitly, as the ping branch does.
         close (s);
         return 0;
      }
   }
   catch (Exception &e)
   {
      fprintf (stderr, "%s: %s\n", progname, e.explain().c_str());
      return 9;
   }

   // Everything below writes to the log file without checking it, and
   // setbuf on a null stream is undefined, so refuse to start without it.
   logfile = fopen (option_logfile_final.c_str(), "w");
   if (logfile == NULL)
   {
      fprintf (stderr, "%s: cannot open log file `%s': %s\n", progname,
               option_logfile_final.c_str(), strerror (errno));
      return 6;
   }
   setbuf (logfile, 0);      // Unbuffered, so messages appear immediately.

   if (!option_nodetach)
      daemonize();
   wait_for_connections();

   if (logfile)
      fclose (logfile);

   pthread_mutex_destroy (&db_mutex);
   return 0;
}

5.1.11

Decode the command line options, using the popt library.

bool
decode_options (int argc, const char **argv)
{
   progname = argv[0];
   poptContext context = poptGetContext ("docd", argc, argv, options_table, 0);

   int rc = poptGetNextOpt (context);
   if (rc < -1)
   {
      fprintf (stderr, "%s: error in command line options.\n", progname);
      return false;
   }

   if (option_logfile)
      option_logfile_final = option_logfile;
   else
      option_logfile_final = default_logfile;

   if (option_dbpath)
      option_dbpath_final = option_dbpath;
   else
      option_dbpath_final = default_dbpath;

5.1.12

If the host IP address wasn't specified, set a default.

   if (option_host == 0)
      option_host = x_strdup (inet_ntoa (find_hostip()));

   if (option_ping || option_kill)
      options_client = true;

   return true;
}

5.1.13
void
prepare_db ()
{
   try
   {
      struct stat sbuf;
      bool tables_are_new = false;
      if (stat (option_dbpath_final.c_str(), &sbuf) || !S_ISDIR (sbuf.st_mode))
      {
         tables_are_new = true;

         if (mkdir (option_dbpath_final.c_str(), 0777))
         {
            string msg = "failed to make the database directory `";
            msg += option_dbpath_final;
            msg += "'";
            syscall_error (msg.c_str(), 6);
         }
         fprintf (stderr, "The DB directory `%s' has been made.\n",
                  option_dbpath_final.c_str());

         for (const TableSpec *tabspec = table_specs; tabspec->var; ++tabspec)
         {
            string filename = option_dbpath_final;
            filename += '/';
            filename += tabspec->filename;
            filename += ".table";
            Table::create (filename, tabspec->fields, tabspec->primkey,
                           tabspec->indices);
         }

         fprintf (stderr, "The tables have been created.\n");
      }

      for (const TableSpec *tabspec = table_specs; tabspec->var; ++tabspec)
      {
         string filename = option_dbpath_final;
         filename += '/';
         filename += tabspec->filename;
         filename += ".table";
         *tabspec->var = new Table (filename, false);
      }

      if (tables_are_new)
         populate_format_table();
   }
   catch (Exception &e)
   {
      fprintf (stderr, "%s: %s\n", progname, e.explain().c_str());
      exit (9);
   }
}

5.1.14

Insert information about the standard file formats into the `format' table.

void
populate_format_table ()
{
   FILE *f = fopen ("formats.dat", "r");
   if (f == NULL)
   {
      fprintf (stderr, "Error reading standard file format data: %s",
               strerror (errno));
      exit (6);
   }

   char namebuf[256], iconfile[256], viewersbuf[256];
   while (fscanf (f, " %255s %255s %255s", namebuf, iconfile, viewersbuf) == 3)
   {
      try
      {
         for (char *p = viewersbuf; *p; ++p)
            if (*p == '_')
               *p = ' ';

         TableRecord rec;
         rec.fields.resize (3);
         rec.fields[0].type = 's';
         rec.fields[0].v_string = new string (namebuf);
         rec.fields[1].type = 's';
         rec.fields[1].v_string = new string (iconfile);
         rec.fields[2].type = 's';
         rec.fields[2].v_string = new string (viewersbuf);

         rec.id = 0;
         table_format->insert (rec);
         if (option_verbose)
            fprintf (stderr, "Added format `%s' with ID=%u.\n", namebuf,
                     rec.id);

         delete rec.fields[0].v_string;
         delete rec.fields[1].v_string;
         delete rec.fields[2].v_string;
      }
      catch (Exception &e)
      {
         fprintf (stderr, "Exception: %s.\n", e.explain().c_str());
         exit (1);
      }
   }

   fclose (f);
}

5.1.15
// Adds a file format to the `format' table and returns the ID of the new
// record.
//
// The format table has three string fields (name, icon file, viewers; see
// table_specs and populate_format_table).  This interface has no icon
// argument, so the icon field is stored as an empty string.
//
// The record used to be inserted into the document table with only two
// fields.  db_mutex is released even if the insert throws, so a failed
// insert cannot block every later database access.
Key
add_format (string &name, string &viewers)
{
   string icon;

   TableRecord rec;
   rec.fields.resize (3);
   rec.fields[0].type = 's';
   rec.fields[0].v_string = &name;
   rec.fields[1].type = 's';
   rec.fields[1].v_string = &icon;
   rec.fields[2].type = 's';
   rec.fields[2].v_string = &viewers;

   rec.id = 0;
   pthread_mutex_lock (&db_mutex);
   try
   {
      table_format->insert (rec);
   }
   catch (...)
   {
      pthread_mutex_unlock (&db_mutex);
      throw;
   }
   pthread_mutex_unlock (&db_mutex);

   return rec.id;
}

5.1.16
// Adds a document (file path, format ID, title) to the `document' table
// and returns the ID of the new record.
//
// db_mutex is released even if the insert throws; process_requests catches
// the exception and carries on, so a mutex left locked would deadlock
// every later database access.
Key
add_document (string &path, Key id_format, string &title)
{
   TableRecord rec;
   rec.fields.resize (3);
   rec.fields[0].type = 's';
   rec.fields[0].v_string = &path;
   rec.fields[1].type = 'i';
   rec.fields[1].v_int = id_format;
   rec.fields[2].type = 's';
   rec.fields[2].v_string = &title;

   rec.id = 0;
   pthread_mutex_lock (&db_mutex);
   try
   {
      table_document->insert (rec);
   }
   catch (...)
   {
      pthread_mutex_unlock (&db_mutex);
      throw;
   }
   pthread_mutex_unlock (&db_mutex);

   return rec.id;
}

5.1.17

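As originally written, this function was not thread safe, because it cached the ID of the last used file format in static variables. That cache, together with the format lookup it wrapped, is currently commented out, and a fixed placeholder format ID is passed instead.

The commented-out lookup assumes that the name of the format passed in is never the empty string.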

// Adds a document whose format is given by name rather than by ID.
//
// NOTE(review): the lookup of `format' in the format table is commented
// out below, so the `format' argument is currently ignored and every
// document is stored with the constant format ID 1073741824 (2^30).
// search_result_append looks that ID up in the format table; unless a
// record with that ID exists, it will log "Format ... not found".
Key
add_document (string &path, string &format, string &title)
{
/*   static Key last_format_id = 0;
   static string last_format_name;

   if (format != last_format_name)
   {
      TableField fld;
      fld.type = 's';
      fld.v_string = &format;

      TableRecord rec;
      if (!table_format->find (fld, rec))
         throw ExceptionTable ("document uses unrecognzed format");

      last_format_name = format;
      last_format_id = rec.id;
   }
*/
   return add_document (path, /*last_format_id*/1073741824, title);
}

5.1.18
// Adds a section (owning document ID, title, reference) to the `section'
// table and returns the ID of the new record.
//
// db_mutex is released even if the insert throws; process_requests catches
// the exception and carries on, so a mutex left locked would deadlock
// every later database access.
Key
add_section (Key id_document, string &title, string &ref)
{
   TableRecord rec;
   rec.fields.resize (3);
   rec.fields[0].type = 'i';
   rec.fields[0].v_int = id_document;
   rec.fields[1].type = 's';
   rec.fields[1].v_string = &title;
   rec.fields[2].type = 's';
   rec.fields[2].v_string = &ref;

   rec.id = 0;
   pthread_mutex_lock (&db_mutex);
   try
   {
      table_section->insert (rec);
   }
   catch (...)
   {
      pthread_mutex_unlock (&db_mutex);
      throw;
   }
   pthread_mutex_unlock (&db_mutex);

   return rec.id;
}

5.1.19
// Records that `word' occurs in section `id_section'.
//
// Each record of the word table holds an array of section IDs in which the
// value 0 marks an unused slot.  The section is stored in the first unused
// slot, the array growing by WORD_SECLIST_INC slots when it is full.  A
// section which is already listed is not added again.
//
// db_mutex and the section array are released even when a table operation
// throws.
void
add_word (Key id_section, string &word)
{

5.1.20

Find out if the word is already known, and get its ID if it is.

   TableRecord rec;
   // The section array currently owned by this function, if any.
   vector <unsigned int> *secs = 0;

   pthread_mutex_lock (&db_mutex);

   try
   {
      if (table_word->find (word, rec.fields))
      {
         secs = rec.fields[0].v_int_a;
         vector <unsigned int> &secv = *secs;
         bool duplicate = false, done = false;
         size_t freeindx = 0;

         for (size_t i = 0; !duplicate && i < secv.size(); ++i)
         {
            if (secv[i] == id_section)
               duplicate = true;
            else if (!done && secv[i] == 0)
            {
               done = true;
               freeindx = i;
            }
         }

         if (!duplicate)
         {
            if (done)
               secv[freeindx] = id_section;
            else
            {
               // The array of sections is full, so we make it bigger.
               size_t oldsz = secv.size();
               secv.resize (oldsz + WORD_SECLIST_INC, 0);
               secv[oldsz] = id_section;
            }

            table_word->update (word, rec);
         }
      }
      else
      {
         rec.fields.resize (1);
         rec.fields[0].type = 'I';
         secs = new vector <unsigned int> (WORD_SECLIST_INC, 0);
         rec.fields[0].v_int_a = secs;
         (*secs)[0] = id_section;
         table_word->insert (word, rec);
      }
   }
   catch (...)
   {
      delete secs;
      pthread_mutex_unlock (&db_mutex);
      throw;
   }

   delete secs;
   pthread_mutex_unlock (&db_mutex);
}

5.1.21
void
close_db ()
{
   pthread_mutex_lock (&db_mutex);

   for (const TableSpec *ts = table_specs; ts->filename; ++ts)
   {
      delete *ts->var;
      *ts->var = 0;
   }

   pthread_mutex_unlock (&db_mutex);
}

5.1.22

Fork the process twice and do other things to ensure that it is detached from any terminal.

void
daemonize ()
{
   pid_t pid = fork();
   if (pid == -1)
      syscall_error ("error doing initial fork", 1);
   else if (pid ≠ 0)
      _exit (0);

   errorfile = logfile;

   if (setsid() == -1)
      syscall_error ("error creating new session", 1);

   pid = fork();
   if (pid == -1)
   {
      syscall_error ("error doing second fork", 1);
      exit (1);
   }
   else if (pid ≠ 0)
      _exit (0);

   if (chdir (option_dbpath_final.c_str()))
      syscall_error ("error changing to database directory", 1);

   umask (0);
   for (int i = 0; i ≤ 2; ++i)
   {
      if (close (i))
         // Hmmm, this may not be such a good idea.
         syscall_error ("error closing file descriptor", 1);
   }
}

5.1.23
void
wait_for_connections ()
{

5.1.24

Open the lock file. If this fails then the daemon is probably already running. This won't work on an NFS file system. An alternative is described in open(2).

/*   int rc = open ("/tmp/docd.lock", O_WRONLY | O_CREATE | O_EXCL, 0644);
   if (rc == -1)
   {
      // TODO: check for other errors.
      *logfile << progname << ": lock file already exists.  Maybe docd is already"
                          " running." << endl;
      exit (2);
   }*/

   if (signal (SIGTERM, handle_signal) == SIG_IGN)
      signal (SIGTERM, SIG_IGN);
   signal (SIGINT, SIG_IGN);
   signal (SIGHUP, SIG_IGN);

5.1.25

Open a socket.

   int s = socket (PF_INET, SOCK_STREAM, 0);
   if (s == -1)
      syscall_error ("error opening socket.", 2);

   sockaddr_in addr;
   addr.sin_family = AF_INET;
   addr.sin_port = htons (option_port);
   addr.sin_addr.s_addr = inet_addr (option_host);
   int rc = bind (s, (sockaddr *) &addr, sizeof (addr));
   if (rc == -1)
      syscall_error ("error binding to port (there may already be a docd"
                     " daemon running): ", 2);
   rc = listen (s, 16);
   assert (rc == 0);

5.1.26

Process connections as they arrive.

   fputs ("Waiting for messages.\n", logfile);
   while (!finish_processing)
   {
      unsigned int client_addrlen;
      rc = accept (s, (sockaddr *) &addr, &client_addrlen);
      if (rc == -1)
         fputs ("Error accepting connection.\n", logfile);
      else
      {
         fprintf (logfile, "Recieved connection on socket `%d'.\n", rc);
         ThreadStuff *stuff = new ThreadStuff;
         stuff->thread = new pthread_t;
         stuff->socket = rc;
         if (pthread_create (stuff->thread, NULL, process_requests, stuff))
         {
            fputs ("Error creating new thread.", logfile);
            exit (1);
         }
      }
   }
}

5.1.27

Process requests received on a connection until it is closed.

void *
process_requests (void *sp)
{
   ThreadStuff *ts = (ThreadStuff *) sp;
   int s = ts->socket;
   char c;
   bool cont = true;
   Key current_document = 0, current_section = 0;

   while (cont)
   {
      int rc = recv (s, &c, 1, 0);
      if (rc < 0)
      {
         fprintf (logfile, "Error recieving a character from client: %s\n",
                  strerror (errno));
         goto stuffit;
      }
      else if (rc == 0 || c == 4)   // Detect EOF or Ctrl-D.
         goto stuffit;

      try
      {
         cont = process_command (s, c, current_document, current_section);
      }
      catch (Exception &e)
      {
         fprintf (logfile, "Exception: %s\n", e.explain().c_str());
      }
   }

stuffit:
   fputs ("Closing connection to client.\n", logfile);
   close (s);
   delete ts->thread;
   delete ts;
   pthread_exit (0);

   return 0;      // This won't happen.
}

5.1.28

Deal with an individual message from the client. Returns `true' if the connection should remain open for more requests.

bool
process_command (int s, char c, Key &current_document, Key &current_section)
{
   string buf, title, path, format;

   switch (c)
   {
    case ' ': case '\n': case '\r': case '\t':
      break;

    case '?':
      sock_send_str (s, "This is docd, version " VERSION ".\n"
                   "The following commands are available:\n"
                   "   d - add a document\n"
                   "   e - indicates the end of information about a document\n"
                   "   K - shut down the server\n"
                   "   P - ping (just replies with `OK')\n"
                   "   Q - close connection\n"
                   "   s - add a section heading to the current document\n"
                   "   S - sync the database with the files it is stored in\n"
                   "   w - add a word to the current document\n");
      break;

    case '#':
      buf = "";
      sock_read_string (s, buf);
      break;

    case 'c':
      buf = "";
      sock_read_string (s, buf);

      {
         vector <string> words;
         split (words, buf, ' ');

         vector <SearchResult> results;
         search_for_words (results, words);
         sock_send_search_results (s, results);
      }
      break;

    case 'd':
      path = title = buf = format = "";
      sock_read_string (s, path);
      sock_read_string (s, title);
      sock_read_string (s, buf);
      sock_read_string (s, buf);
      sock_read_string (s, format);

      if (current_document)
         fprintf (logfile, "Tried to start a new document in another one.\n");
      else
      {
         // TODO: check that we don't already have it.
         assert (current_section == 0);
         current_document = add_document (path, format, title);

         if (option_verbose)
            fprintf (logfile, "Added document `%s', ID=%u.\n",
                     path.c_str(), current_document);
      }
      break;

    case 'e':
      if (current_document)
         current_document = current_section = 0;
      else
         fprintf (logfile, "Tried to end document without starting one.\n");
      break;

    case 'Q':
      sock_send_str (s, "OK - closing your connection\n");
      return false;

    case 'K':
      sock_send_str (s, "OK - shutting the server down\n");
      close (s);
      close_db();
      fclose (logfile);
      exit (0);

    case 'P':
      sock_send_str (s, "OK\n");
      break;

    case 's':
      title = buf = "";
      sock_read_string (s, title);
      sock_read_string (s, buf);    // Reference.

      if (current_document)
      {
         current_section = add_section (current_document, title, buf);

         if (option_verbose)
            fprintf (logfile, "Added section `%s', ID=%u.\n",
                     title.c_str(), current_section);
      }
      else
         fprintf (logfile, "Tried to add section outside a document.\n");
      break;

    case 'S':
      for (const TableSpec *ts = table_specs; ts->filename; ++ts)
         (*ts->var)->sync();
      break;

    case 'w':
      buf = "";
      sock_read_string (s, buf);

      if (current_document)
      {
         if (current_section == 0)
         {
            string foo = "";
            current_section = add_section (current_document, foo, foo);

            if (option_verbose)
               fprintf (logfile, "Added default section `%s', ID=%u.\n",
                        title.c_str(), current_section);
         }

         add_word (current_section, buf);

         if (option_verbose)
            fprintf (logfile, "Added word `%s'.\n", buf.c_str());
      }
      else
         fprintf (logfile, "Error, word `%s' written outside document.\n",
                  buf.c_str());
      break;

    default:
      string err = "Error: command `";
      err += c;
      err += "' unrecognized.\n";
      sock_send_str (s, err);
   }

   return true;
}

5.1.29
// Finds the sections which contain every one of `words' (compared in lower
// case) and stores one SearchResult per matching section in `results',
// which is cleared first.  The result is empty if `words' is empty or if
// any word is not in the word table.
//
// Differences from the earlier version: unused slots in a word's section
// array (value 0, see add_word) are no longer treated as section IDs,
// characters are converted to unsigned char before being given to tolower,
// and db_mutex is released if a table lookup throws.
void
search_for_words (vector <SearchResult> &results, const vector <string> &words)
{
   results.clear();
   if (words.empty())
      return;

   vector <string> lcase (words.size());
   for (size_t i = 0; i < words.size(); ++i)
      for (size_t j = 0; j < words[i].size(); ++j)
         lcase[i] += (char) tolower ((unsigned char) words[i][j]);

5.1.30

Find the set of sections which match each word individually.

   vector <vector <Key> > sections_per_word (lcase.size());
   pthread_mutex_lock (&db_mutex);
   try
   {
      for (size_t i = 0; i < lcase.size(); ++i)
      {
         vector <TableField> flds;
         if (!table_word->find (lcase[i], flds))
         {
            pthread_mutex_unlock (&db_mutex);
            return;
         }

         assert (flds[0].type == 'I');
         size_t sz = flds[0].v_int_a->size();
         sections_per_word[i].resize (sz);
         for (size_t j = 0; j < sz; ++j)
            sections_per_word[i][j] = (*flds[0].v_int_a)[j];

         delete flds[0].v_int_a;
      }

5.1.31

Combine the sets with an intersection.

//   for (size_t i = 0; i < lcase.size(); ++i)
//      sort (sections_per_word[i].begin(), sections_per_word[i].end());

      for (size_t i = 0; i < sections_per_word[0].size(); ++i)
      {
         Key secid = sections_per_word[0][i];

         // Zero marks an unused slot in the array, not a section.
         if (secid == 0)
            continue;

         bool missing = false;
         for (size_t j = 1; !missing && j < lcase.size(); ++j)
         {
            bool found = false;
            for (size_t k = 0; !found && k < sections_per_word[j].size(); ++k)
               found = (sections_per_word[j][k] == secid);

            missing = !found;
         }

         if (!missing)
            search_result_append (results, secid);
      }
   }
   catch (...)
   {
      pthread_mutex_unlock (&db_mutex);
      throw;
   }

   pthread_mutex_unlock (&db_mutex);
}

5.1.32
// Appends to `results' an entry describing section `secid': the section's
// title and reference, then the filename and title of its document, then
// the name, icon and viewers of the document's format.
//
// If the section is not found nothing is appended.  If the document or the
// format is not found the entry is still appended, with the fields that
// could not be looked up left as they were constructed.  Each failed
// lookup is reported in the log file.
void
search_result_append (vector <SearchResult> &results, Key secid)
{
   vector <TableField> row;

   if (!table_section->find (secid, row))
   {
      fprintf (logfile, "Section `%u' not found in `search_result_append'.\n",
               (unsigned int) secid);
      return;
   }

   const size_t slot = results.size();
   results.resize (slot + 1);
   SearchResult &hit = results[slot];

   // Section record: document ID, title, reference.
   const Key docid = row[0].v_int;
   hit.section_title = *row[1].v_string;
   hit.section_ref = *row[2].v_string;
   delete row[1].v_string;
   delete row[2].v_string;

   if (!table_document->find (docid, row))
   {
      fprintf (logfile, "Document `%u' not found in search.\n",
               (unsigned int) docid);
      return;
   }

   // Document record: filename, format ID, title.
   const Key fmtid = row[1].v_int;
   hit.document_filename = *row[0].v_string;
   hit.document_title = *row[2].v_string;
   delete row[0].v_string;
   delete row[2].v_string;

   if (!table_format->find (fmtid, row))
   {
      fprintf (logfile, "Format `%u' not found in search.\n",
               (unsigned int) fmtid);
      return;
   }

   // Format record: name, icon, viewers.
   hit.format_name = *row[0].v_string;
   hit.format_icon = *row[1].v_string;
   hit.format_viewers = *row[2].v_string;
   delete row[0].v_string;
   delete row[1].v_string;
   delete row[2].v_string;
}

5.1.33
// Sends the search results to socket `s': first a line holding the number
// of results, then seven lines per result (document filename, document
// title, section title, section reference, format icon, format name,
// format viewers).
//
// results.size() is a size_t, so it is converted to unsigned long and
// printed with %lu; passing it to %i is undefined where size_t is wider
// than int.
void
sock_send_search_results (int s, const vector <SearchResult> &results)
{
   char buf[32];
   snprintf (buf, sizeof (buf), "%lu\n", (unsigned long) results.size());
   sock_send_str (s, buf);

   for (size_t i = 0; i < results.size(); ++i)
   {
      sock_send_str_nl (s, results[i].document_filename);
      sock_send_str_nl (s, results[i].document_title);
      sock_send_str_nl (s, results[i].section_title);
      sock_send_str_nl (s, results[i].section_ref);
      sock_send_str_nl (s, results[i].format_icon);
      sock_send_str_nl (s, results[i].format_name);
      sock_send_str_nl (s, results[i].format_viewers);
   }
}

5.1.34
// Signal handler: notes the signal in the log file, asks the accept loop
// in wait_for_connections to stop, and re-installs itself.
//
// errno is saved and restored so that the code which was interrupted does
// not see a value left behind by the calls made here.
//
// NOTE(review): fprintf is not async-signal-safe, so logging from here can
// misbehave if the signal arrives while another stdio call is running.
// It is kept so that the log output stays the same.
void
handle_signal (int sig)
{
   const int saved_errno = errno;

   if (logfile)
      fprintf (logfile, "Recieved signal number `%d'.\n", sig);

   finish_processing = true;
   signal (sig, handle_signal);     // ???

   errno = saved_errno;
}

5.1.35
void
syscall_error (const char *msg, int exitcode)
{
   fprintf (stderr, "%s: %s: %s\n", progname, msg, strerror (errno));
   exit (exitcode);
}