/* #includes */ /*{{{C}}}*//*{{{*/
#include "config.h"

#include <sys/types.h>
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#ifdef HAVE_GETTEXT
#include <libintl.h>
#define _(String) gettext(String)
#else
#define _(String) String
#endif
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "getopt.h"
/*}}}*/
/* #defines */ /*{{{*/
/* True if c is one of a-z or A-Z, independent of the locale.  The argument is evaluated more than once. */
#define ISALPHA(c) (((c)>='a' && (c)<='z') || ((c)>='A' && (c)<='Z'))
/*}}}*/

/* types */ /*{{{*/
/* One entry of the singly linked list of link targets collected by dehtml(). */
struct Url
{
  char *url;        /* heap copy of the href value; freed together with the entry */
  int number;       /* index that is printed as [number] after the link and in the final list */
  struct Url *next; /* next entry, or a null pointer at the end of the list */
};
/*}}}*/
/* variables */ /*{{{*/
/* non-zero if main() found UTF-8 in the LC_CTYPE locale name; selects UTF-8 output for entities */
static int utf8term;
/*
 * Parser state.  intitle, inpre, inscript, inxml and instyle are flags
 * set by the opening tag and cleared by the closing tag; inlist and
 * inheader count the nesting depth of lists and headers.  inword is the
 * word list state of wordputchar(): 0 outside a word, 1 after the first
 * letter, 2 once the word is being printed.
 */
static int intitle,inlist,inheader,inpre,inword,inscript,inxml,instyle;
/* command line options -w, -s, -l, -p and -u, set by main() */
static int words,skipheaders,skiplists,pretty,skipurls;
/* name of the document being processed as passed to dehtml(); main() passes a null pointer for standard input */
static const char *file;
/* current input line of that document, starting at 1 */
static int line;
/*}}}*/

/*
 * Write one character of document text to standard output, unless the
 * parser state says it has to be suppressed.
 *
 * Text is suppressed while inscript, inxml or instyle is set, inside
 * lists if skiplists is set and inside headers and the title if
 * skipheaders is set.  In word list mode (words) nothing is suppressed
 * here.  Without pretty printing the newlines of suppressed text are
 * still written.
 *
 * With pretty printing, blanks and tabs are held back and written as
 * blanks in front of the next visible character, so white space at the
 * end of a line disappears, and no more than two newlines are written
 * in a row.
 *
 * c is the character to write; it must not be negative.
 */
static void condputchar(int c) /*{{{*/
{
  static int nls=2;           /* newlines written in a row; starting at 2 drops leading empty lines */
  static int trailingwhite=0; /* blanks and tabs seen, but not written yet */
  int visible;

  assert(c>=0);
  /* skipheaders has to hide header and title text, like skiplists hides list text */
  visible=words
          ||
          (
            !(skiplists && inlist)
            && !inscript
            && !inxml
            && !instyle
            && !(skipheaders && (inheader || intitle))
          );
  if (!visible)
  {
    if (!pretty && c=='\n') putchar(c);
    return;
  }
  if (!pretty)
  {
    putchar(c);
    return;
  }
  if (c=='\n')
  {
    /* white space at the end of the line is dropped */
    trailingwhite=0;
    if (++nls>2) return;
  }
  else if (c==' ' || c=='\t')
  {
    ++trailingwhite;
    return;
  }
  else
  {
    nls=0;
  }
  while (trailingwhite)
  {
    --trailingwhite;
    putchar(' ');
  }
  putchar(c);
}
/*}}}*/
/*
 * Write one character, building a word list if words is set.
 *
 * Without word list mode the character is handed to condputchar()
 * unchanged.  In word list mode only runs of letters and underscores
 * are written, each followed by a newline.  The first character of a
 * run is held back until a second one arrives, so runs of a single
 * character are never written.
 *
 * c is the character to write; it must not be negative.
 */
static void wordputchar(int c) /*{{{*/
{
  static int first; /* first character of the word, held back while inword is 1 */

  assert(c>=0);
  if (!words)
  {
    condputchar(c);
    return;
  }
  if (isalpha(c) || c=='_')
  {
    if (inword==0)
    {
      /* remember the character, it is only written if another one follows */
      first=c;
      inword=1;
    }
    else if (inword==1)
    {
      inword=2;
      condputchar(first);
      condputchar(c);
    }
    else if (inword==2)
    {
      condputchar(c);
    }
  }
  else if (inword)
  {
    /* the word ends here; terminate it if it was written */
    if (inword==2) condputchar('\n');
    inword=0;
  }
}
/*}}}*/
/*
 * Write the character with the code point c.
 *
 * If utf8term is set, the code point is written as a UTF-8 sequence of
 * one to four bytes.  Surrogates and values above 0x10ffff are not
 * characters and are written as U+FFFD.
 *
 * Otherwise the output is one byte per character: code points up to
 * 0xff are written as that byte, anything else is written as '?',
 * because wordputchar() only accepts values that fit a single byte.
 *
 * c must not be negative.
 */
static void wordpututf8(int c) /*{{{*/
{
  assert(c>=0);
  if (!utf8term)
  {
    wordputchar(c<=0xff ? c : '?');
    return;
  }
  if ((c>=0xd800 && c<=0xdfff) || c>0x10ffff) c=0xfffd;
  if (c<=0x7f)
  {
    wordputchar(c);
  }
  else if (c<=0x7ff)
  {
    wordputchar(0xc0 | (c >> 6));
    wordputchar(0x80 | (c & 0x3f));
  }
  else if (c<=0xffff)
  {
    wordputchar(0xe0 | (c >> 12));
    wordputchar(0x80 | ((c >> 6) & 0x3f));
    wordputchar(0x80 | (c & 0x3f));
  }
  else
  {
    wordputchar(0xf0 | (c >> 18));
    wordputchar(0x80 | ((c >> 12) & 0x3f));
    wordputchar(0x80 | ((c >> 6) & 0x3f));
    wordputchar(0x80 | (c & 0x3f));
  }
}
/*}}}*/
static void entitychar(FILE *fp, int c) /*{{{*/
{
  char entity[73];
  int i=0;

  assert(c>=0);
  if (c!='&')
  {
    wordputchar(c);
    return;
  }

  if ((c=getc(fp))=='#')
  {
    c=getc(fp);
    if (isdigit(c))
    {
      int numeric=c-'0';

      while ((c=getc(fp))!=EOF && isdigit(c))
      {
        numeric=numeric*10+(c-'0');
      }
      wordpututf8(numeric);
      if (c!=';') wordputchar(c);
    }
    else if (toupper(c)=='X')
    {
      int numeric=0;

      while ((c=getc(fp))!=EOF && isxdigit(c))
      {
        numeric=numeric*16+(isdigit(c) ? c-'0' : toupper(c)-'A'+10);
      }
      wordpututf8(numeric);
      if (c!=';') wordputchar(c);
    }
    else
    {
      wordputchar('&');
      wordputchar('#');
    }
  }
  else if (ISALPHA(c) || isdigit(c) || c=='.' || c=='-')
  {
    /* variables */ /*{{{*/
    static struct
    {
      const char *name;
      char value;
      unsigned char utf8value[4];
    }
    const *eptr,
    entities[]=
    {
      { "gt",     '>', { '>' } },
      { "lt",     '<', { '<' } },
      { "amp",    '&', { '&' } },
      { "quot",   '"', { '"' } },
      { "ndash",  '-', { '-' } },
      { "shy",    '\0' },
      { "AElig",  '', { (char)0xc3, (char)0x86 } },
      { "Aacute", '', { (char)0xc3, (char)0x81 } },
      { "Acirc",  '', { (char)0xc3, (char)0x82 } },
      { "Agrave", '', { (char)0xc3, (char)0x80 } },
      { "Aring",  '', { (char)0xc3, (char)0x85 } },
      { "Atilde", '', { (char)0xc3, (char)0x83 } },
      { "Auml",   '', { (char)0xc3, (char)0x84 } },
      { "Ccedil", '', { (char)0xc3, (char)0x87 } },
      { "ETH",    '', { (char)0xc3, (char)0x90 } },
      { "Eacute", '', { (char)0xc3, (char)0x89 } },
      { "Ecirc",  '', { (char)0xc3, (char)0x8a } },
      { "Egrave", '', { (char)0xc3, (char)0x88 } },
      { "Euml",   '', { (char)0xc3, (char)0x8b } },
      { "Iacute", '', { (char)0xc3, (char)0x8d } },
      { "Icirc",  '', { (char)0xc3, (char)0x8e } },
      { "Igrave", '', { (char)0xc3, (char)0x8c } },
      { "Iuml",   '', { (char)0xc3, (char)0x8f } },
      { "Ntilde", '', { (char)0xc3, (char)0xb1 } },
      { "Oacute", '', { (char)0xc3, (char)0xb3 } },
      { "Ocirc",  '', { (char)0xc3, (char)0xb4 } },
      { "Ograve", '', { (char)0xc3, (char)0xb2 } },
      { "Oslash", '', { (char)0xc3, (char)0xb8 } },
      { "Otilde", '', { (char)0xc3, (char)0xb5 } },
      { "Ouml",   '', { (char)0xc3, (char)0xb6 } },
      { "THORN",  '', { (char)0xc3, (char)0xbe } },
      { "Uacute", '', { (char)0xc3, (char)0xba } },
      { "Ucirc",  '', { (char)0xc3, (char)0xbb } },
      { "Ugrave", '', { (char)0xc3, (char)0xb9 } },
      { "Uuml",   '', { (char)0xc3, (char)0xbc } },
      { "Yacute", '', { (char)0xc3, (char)0xbd } },
      { "aacute", '', { (char)0xc3, (char)0xa1 } },
      { "acirc",  '', { (char)0xc3, (char)0xa2 } },
      { "aelig",  '', { (char)0xc3, (char)0xa6 } },
      { "agrave", '', { (char)0xc3, (char)0xa0 } },
      { "aring",  '', { (char)0xc3, (char)0xa5 } },
      { "atilde", '', { (char)0xc3, (char)0xa3 } },
      { "auml",   '', { (char)0xc3, (char)0xa4 } },
      { "ccedil", '', { (char)0xc3, (char)0xa7 } },
      { "eacute", '', { (char)0xc3, (char)0xa9 } },
      { "ecirc",  '', { (char)0xc3, (char)0xaa } },
      { "egrave", '', { (char)0xc3, (char)0xa8 } },
      { "eth",    '', { (char)0xc3, (char)0xb0 } },
      { "euml",   '', { (char)0xc3, (char)0xab } },
      { "iacute", '', { (char)0xc3, (char)0xad } },
      { "icirc",  '', { (char)0xc3, (char)0xae } },
      { "igrave", '', { (char)0xc3, (char)0xac } },
      { "iuml",   '', { (char)0xc3, (char)0xaf } },
      { "nbsp",   '', { (char)0xc2, (char)0xa0 } },
      { "ntilde", '', { (char)0xc3, (char)0xb1 } },
      { "oacute", '', { (char)0xc3, (char)0xb3 } },
      { "ocirc",  '', { (char)0xc3, (char)0xb4 } },
      { "ograve", '', { (char)0xc3, (char)0xb2 } },
      { "oslash", '', { (char)0xc3, (char)0xb8 } },
      { "otilde", '', { (char)0xc3, (char)0xb5 } },
      { "ouml",   '', { (char)0xc3, (char)0xb6 } },
      { "szlig",  '', { (char)0xc3, (char)0x9f } },
      { "thorn",  '', { (char)0xc3, (char)0xbe } },
      { "uacute", '', { (char)0xc3, (char)0xba } },
      { "ucirc",  '', { (char)0xc3, (char)0xbb } },
      { "ugrave", '', { (char)0xc3, (char)0xb9 } },
      { "uuml",   '', { (char)0xc3, (char)0xbc } },
      { "yacute", '', { (char)0xc3, (char)0xbd } },
      { "yuml",   '', { (char)0xc3, (char)0xbf } },
      { "zwj",     '\0' },
      { "zwnj",    '\0' },
    };
    /*}}}*/

    entity[i++]=(char)c;
    while ((c=getc(fp))!=EOF && (ISALPHA(c) || isdigit(c) || c=='.' || c=='-'))
    {
      if (i<sizeof(entity)-1) entity[i++]=(char)c;
    }
    entity[i]='\0';
    for (eptr=entities; eptr<entities+sizeof(entities)/sizeof(entities[0]); ++eptr)
    {
      if (strcmp(eptr->name,entity)==0)
      {
        if (utf8term)
        {
          if (eptr->utf8value[0]) wordputchar(eptr->utf8value[0]);
          if (eptr->utf8value[1]) wordputchar(eptr->utf8value[1]);
          if (eptr->utf8value[2]) wordputchar(eptr->utf8value[2]);
          if (eptr->utf8value[3]) wordputchar(eptr->utf8value[3]);
        }
        else if (eptr->value) wordputchar(eptr->value);
        if (c!=';') wordputchar(c);
        return;
      }
      else if (strcmp(entity,"hellip")==0)
      {
        if (utf8term)
        {
          wordputchar(0xe2);
          wordputchar(0x80);
          wordputchar(0xa6);
        }
        else
        {
          wordputchar('.');
          wordputchar('.');
          wordputchar('.');
        }
        return;
      }
    }
    wordputchar('&');
    for (i=0; entity[i]; ++i) wordputchar(entity[i]);
    wordputchar(c);
  }
  else
  {
    wordputchar('&');
    wordputchar(c);
  }
}
/*}}}*/
/*
 * Remove the HTML markup from one document and write its text.
 *
 * fp is read until its end.  fileName is stored in file, and line
 * counts the input lines.  The parser state is reset first, so a
 * document that ends inside an element does not affect the next one.
 *
 * Tags are removed from the text.  The tag name sets the parser state
 * that condputchar() uses to suppress text, and with pretty printing
 * p, hr, br and the header tags cause line breaks.  Newlines inside
 * tags are written, so the output keeps the lines of the input.  The
 * value of the alt attribute of img tags is written as text.
 *
 * Unless skipurls or words is set, the href value of each a tag is
 * collected.  At the closing tag its number is written as " [n]", and
 * after the document the list of numbered link targets follows.  The
 * program ends with exit code 1 if there is no memory for that list.
 *
 * Attribute names are matched case insensitively.  A tag that is cut
 * off by the end of the input ends there.
 */
static void dehtml(FILE *fp, const char *fileName) /*{{{*/
{
  int c;
  char href[1024]; /* link target of the current a tag; always terminated */
  struct Url *urls,**lasturl;

  line=1;
  file=fileName;
  intitle=inlist=inheader=inpre=0;
  /* the previous document may have ended inside one of these elements */
  inscript=inxml=instyle=0;
  href[0]='\0';
  urls=(struct Url*)0;
  lasturl=&urls;
  while ((c=getc(fp))!=EOF)
  {
    if (c=='<') /* tag */ /*{{{*/
    {
      char tag[80];
      char attribute[80];
      int i;

      if (words) wordputchar(' ');
      /* tag name */ /*{{{*/
      i=0;
      while ((c=getc(fp))!=EOF && c!='>' && c!=' ' && c!='\n')
      {
        if ((size_t)i<sizeof(tag)-1) tag[i++]=(char)tolower(c);
      }
      tag[i]='\0';
      if (c=='\n')
      {
        ++line;
        wordputchar('\n');
      }
      if (i)
      {
        if (strcmp(tag,"p")==0 || strcmp(tag,"hr")==0) /*{{{*/
        {
          if (!words && pretty) { wordputchar('\n'); wordputchar('\n'); }
        }
        /*}}}*/
        else if (strcmp(tag,"br")==0) /*{{{*/
        {
          if (!words && pretty) wordputchar('\n');
        }
        /*}}}*/
        else if (strcmp(tag,"title")==0) intitle=1;
        else if (strcmp(tag,"/title")==0) intitle=0;
        else if (tag[0]=='h' && isdigit((unsigned char)tag[1]) && tag[2]=='\0') /*{{{*/
        {
          if (!words && pretty)
          {
            wordputchar('\n');
            wordputchar('\n');
          }
          ++inheader;
        }
        /*}}}*/
        else if (tag[0]=='/' && tag[1]=='h' && isdigit((unsigned char)tag[2]) && tag[3]=='\0') /*{{{*/
        {
          if (!words && pretty)
          {
            wordputchar('\n');
            wordputchar('\n');
          }
          if (inheader) --inheader;
        }
        /*}}}*/
        else if (strcmp(tag,"pre")==0) inpre=1;
        else if (strcmp(tag,"/pre")==0) inpre=0;
        else if (strcmp(tag,"dl")==0) ++inlist;
        else if (strcmp(tag,"/dl")==0) { if (inlist) --inlist; }
        else if (strcmp(tag,"ul")==0) ++inlist;
        else if (strcmp(tag,"/ul")==0) { if (inlist) --inlist; }
        else if (strcmp(tag,"ol")==0) ++inlist;
        else if (strcmp(tag,"/ol")==0) { if (inlist) --inlist; }
        else if (strcmp(tag,"/a")==0 && href[0] && !skipurls && !words) /*{{{*/
        {
          struct Url *u;
          char n[32],*s;
          int number=0;

          /* reuse the number of a link target that is known already */
          for (u=urls; u && strcmp(u->url,href); u=u->next) number=u->number;
          if (u==(struct Url*)0)
          {
            u=malloc(sizeof(*u));
            if (u) u->url=malloc(strlen(href)+1);
            if (u==(struct Url*)0 || u->url==(char*)0)
            {
              fprintf(stderr,"dehtml: %s\n",strerror(ENOMEM));
              exit(1);
            }
            strcpy(u->url,href);
            u->number=number+1;
            u->next=(struct Url*)0;
            *lasturl=u;
            lasturl=&u->next;
          }
          snprintf(n,sizeof(n)," [%d]",u->number);
          for (s=n; *s; ++s) wordputchar(*s);
          href[0]='\0';
        }
        /*}}}*/
        else if (strcmp(tag,"script")==0) inscript=1;
        else if (strcmp(tag,"/script")==0) inscript=0;
        else if (strcmp(tag,"style")==0) instyle=1;
        else if (strcmp(tag,"/style")==0) instyle=0;
        else if (strcmp(tag,"xml")==0) inxml=1;
        else if (strcmp(tag,"/xml")==0) inxml=0;
      }
      /*}}}*/
      if (c!=EOF && c!='>') /* tag attributes */ /*{{{*/
      {
        enum { EMPTY, ATTRIBUTE, EQ, VALUE, DOUBLEQUOTEDVALUE, SINGLEQUOTEDVALUE } state=EMPTY;
        int output_value=0; /* the value is the alt text of an img tag and is written */
        int a_href=0;       /* the value is the href of an a tag and is stored in href */

        do
        {
          c=getc(fp);
          /* the input ends inside the tag: there is nothing left to parse */
          if (c==EOF) break;
          if (c=='\n')
          {
            ++line;
            wordputchar('\n');
          }
          switch (state)
          {
            case EMPTY: /*{{{*/
            {
              if (ISALPHA(c))
              {
                state=ATTRIBUTE;
                i=0;
                /* lower case like the rest of the name, so HREF and ALT are found */
                attribute[i++]=(char)tolower(c);
              }
              break;
            }
            /*}}}*/
            case ATTRIBUTE: /*{{{*/
            {
              if (ISALPHA(c))
              {
                if ((size_t)i<sizeof(attribute)-1) attribute[i++]=(char)tolower(c);
              }
              else
              {
                attribute[i]='\0';
                if (c=='=')
                {
                  state=EQ;
                  a_href=(strcmp(tag,"a")==0) && (strcmp(attribute,"href")==0);
                  output_value=(strcmp(tag,"img")==0) && (strcmp(attribute,"alt")==0);
                }
                else state=EMPTY;
              }
              break;
            }
            /*}}}*/
            case EQ: /*{{{*/
            {
              i=0;
              if (a_href) href[0]='\0';
              if (c=='"') state=DOUBLEQUOTEDVALUE;
              else if (c=='\'') state=SINGLEQUOTEDVALUE;
              else if (c=='>')
              {
                /* the tag ends without a value */
                a_href=0;
                output_value=0;
                state=EMPTY;
              }
              else
              {
                state=VALUE;
                if (a_href)
                {
                  href[i++]=(char)c;
                  href[i]='\0';
                }
                else if (output_value) entitychar(fp,c);
              }
              break;
            }
            /*}}}*/
            case DOUBLEQUOTEDVALUE:
            case SINGLEQUOTEDVALUE:
            case VALUE: /*{{{*/
            {
              int closes;

              if (state==DOUBLEQUOTEDVALUE) closes=(c=='"');
              else if (state==SINGLEQUOTEDVALUE) closes=(c=='\'');
              else closes=(c==' ' || c=='>' || c=='"' || c=='\'');
              if (closes)
              {
                a_href=0;
                output_value=0;
                if (words && state!=VALUE) wordputchar('\n');
                state=EMPTY;
              }
              else if (a_href)
              {
                if ((size_t)i<sizeof(href)-1)
                {
                  href[i++]=(char)c;
                  href[i]='\0';
                }
              }
              else if (output_value) entitychar(fp,c);
              break;
            }
            /*}}}*/
          }
        } while (output_value || c!='>');
      }
      /*}}}*/
    }
    /*}}}*/
    else if (c=='&') /* entity */ /*{{{*/
    {
      entitychar(fp,c);
    }
    /*}}}*/
    else if (c=='\n') /* new line */ /*{{{*/
    {
      ++line;
      wordputchar(c);
    }
    /*}}}*/
    else wordputchar(c);
  }
  wordputchar('\n');
  /* write the list of link targets and free it */
  while (urls)
  {
    char n[32],*s;
    struct Url *f;

    snprintf(n,sizeof(n),"[%d] ",urls->number);
    for (s=n; *s; ++s) wordputchar(*s);
    for (s=urls->url; *s; ++s) wordputchar(*(unsigned char*)s);
    wordputchar('\n');
    free(urls->url);
    f=urls;
    urls=urls->next;
    free(f);
  }
}
/*}}}*/

/*
 * dehtml [-w] [-s] [-l] [-p] [-u] [file ...]
 *
 * Sets up the locale, parses the options and runs dehtml() on each file
 * named on the command line, or on standard input if there is none.
 * UTF-8 output is selected if the name of the LC_CTYPE locale contains
 * UTF-8 in one of its usual spellings.
 *
 * The long options named in the help text are accepted; the spellings
 * --prettyprint and --urls still work as well.
 *
 * Returns 0 on success.  The exit code is 1 after a usage error, if a
 * file can not be opened or if standard output can not be closed.
 */
int main(int argc, char *argv[]) /*{{{*/
{
  /* variable declarations */ /*{{{*/
  char const *ctype;
  FILE *in;
  int usage=0;
  int c;
  static struct option lopts[]=
  {
    { "word-list", no_argument, 0, 'w' },
    { "skip-headers", no_argument, 0, 's' },
    { "skip-lists", no_argument, 0, 'l' },
    { "pretty-print", no_argument, 0, 'p' },
    { "prettyprint", no_argument, 0, 'p' },
    { "skip-urls", no_argument, 0, 'u' },
    { "urls", no_argument, 0, 'u' },
    { "help", no_argument, 0, 'h' },
    { "version", no_argument, 0, 'v' },
    { (const char*)0, 0, 0, '\0' }
  };
  /*}}}*/

  setlocale(LC_MESSAGES,"");

  ctype=setlocale(LC_CTYPE,"");
  if (ctype)
  {
    /* locale names spell the codeset in more than one way, e.g. en_US.UTF-8 and en_US.utf8 */
    utf8term=(strstr(ctype,"UTF-8")!=(char*)0
              || strstr(ctype,"utf-8")!=(char*)0
              || strstr(ctype,"UTF8")!=(char*)0
              || strstr(ctype,"utf8")!=(char*)0);
  }
#ifdef HAVE_GETTEXT
  bindtextdomain("dehtml",LOCALEDIR);
  textdomain("dehtml");
#endif
  /* parse arguments */ /*{{{*/
  while ((c=getopt_long(argc,argv,"wslpu?h",lopts,(int*)0))!=-1) switch(c)
  {
    case 'w': words=1; break;
    case 's': skipheaders=1; break;
    case 'l': skiplists=1; break;
    case 'p': pretty=1; break;
    case 'u': skipurls=1; break;
    case 'h': usage=2; break;
    case 'v': printf("dehtml " VERSION "\n"); exit(0);
    default: usage=1;
  }
  if (usage==1)
  {
    fprintf(stderr,_("Usage: dehtml [-w] [-s] [-l] [-p] [-u] [file ...]\n"));
    fprintf(stderr,"\n");
    fprintf(stderr,_("Try `dehtml -h' or `dehtml --help' for more information.\n"));
    exit(1);
  }
  if (usage==2)
  {
    fprintf(stderr,_("Usage: dehtml [-w] [-s] [-l] [-p] [-u] [file ...]\n"));
    fprintf(stderr,"\n");
    fprintf(stderr,_("Remove HTML constructs from documents.\n"));
    fprintf(stderr,"\n");
    fprintf(stderr,_("-w, --word-list     output a word list\n"));
    fprintf(stderr,_("-s, --skip-headers  do not output headers\n"));
    fprintf(stderr,_("-l, --skip-lists    do not output lists\n"));
    fprintf(stderr,_("-p, --pretty-print  pretty printed output\n"));
    fprintf(stderr,_("-u, --skip-urls     Do not include indexed URLs\n"));
    fprintf(stderr,_("-h, --help          display this help and exit\n"));
    fprintf(stderr,_("    --version       display version and exit\n"));
    fprintf(stderr,"\n");
    fprintf(stderr,_("Report bugs to <michael@moria.de>.\n"));
    exit(0);
  }
  /*}}}*/
  /* dehtml stdin or files, if any */ /*{{{*/
  if (optind<argc)
  {
    for (; optind<argc; ++optind)
    {
      if ((in=fopen(argv[optind],"r"))==(FILE*)0)
      {
        fprintf(stderr,_("dehtml: Opening `%s' failed (%s).\n"),argv[optind],strerror(errno));
        exit(1);
      }
      dehtml(in,argv[optind]);
      fclose(in);
    }
  }
  else dehtml(stdin,(const char*)0);
  /* closing flushes the output, so this is where write errors show up */
  if (fclose(stdout)==EOF)
  {
    fprintf(stderr,_("dehtml: Closing standard output failed (%s).\n"),strerror(errno));
    return 1;
  }
  /*}}}*/
  return 0;
}
/*}}}*/
