/*====================================+=====================================+
! File CFileInit_html.cpp             ! Copyright (C) 2002-2013 Remi PASCAL !
+-------------------------------------+-------------------------------------+
! This file is part of Siren.                                               !
! Siren is free software: you can redistribute it and/or modify it under    !
! the terms of the GNU General Public License as published by the Free      !
! Software Foundation, either version 3 of the License, or any later        !
! version.                                                                  !
! Siren is distributed in the hope that it will be useful, but WITHOUT ANY  !
! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS !
! FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    !
! details.                                                                  !
! You should have received a copy of the GNU General Public License along   !
! with Siren. If not, see <http://www.gnu.org/licenses/>.                   !
+---------------------------------------------------------------------------+
!                                                                           !
!                 Web Page Source: ".html", ".htm"                          !
!                                                                           !
+-------+-------------------------------------------------------------------+
! Notes !                                                                   !
+-------+                                                                   !
! It uses a very simplistic html "analyser" based on regular expressions    !
!                                                                           !
+==========================================================================*/



/*-------------------------------------------------------------------------*/
#include <wx/html/htmlpars.h>
#include <wx/regex.h>
#include "common/sr_lib.h"
#include "CFileInit.h"
#include "CFile.h"
/*-------------------------------------------------------------------------*/



/*--------------------------------------------------------------------------+
! Data/Treatment used during the loading                                    !
+--------------------------------------------------------------------------*/
class CFileInit_html : public CFileInit_type_base
{
   /*----------------------------------------------------------------------*/
   public :
      /*--( The parent is only forwarded to the base class )---------------*/
      CFileInit_html( CFileInit &parent )
                    : CFileInit_type_base( parent )
      {}

      /*--( Store in "i_col" the decoded first group matched by the regex )*/
      int tag_regex( const wxString &s_buffer,
                     const char * const co_p_c_regex,
                     int i_col
                   ) ;

      /*--( Load the start of the file and extract the html information )--*/
      int html_read() ;
   /*----------------------------------------------------------------------*/
} ;

/*--------------------------------------------------------------------------+
! Decode the entities of an html text string: "A&amp;B" => "A&B", etc.      !
! I first wrote my own function, then found this class in the wx HTML lib.  !
! The class is "undocumented", so it may "disappear" in the future ...      !
+--------------------------------------------------------------------------*/
static wxString st_html_conv_html( const wxString &s )
{
   /*--( All the work is delegated to the wx html entities parser )--------*/
   wxHtmlEntitiesParser entities_parser ;
   /*----------------------------------------------------------------------*/
   return( entities_parser.Parse( s ) ) ;
   /*----------------------------------------------------------------------*/
}

/*--------------------------------------------------------------------------+
! Extraction of a string quoted (or not) with '"' or '''.                   !
! If it is not quoted, it ends on the first character that is neither       !
! alphanumeric nor one of '.', '-', '_'.                                    !
+--------------------------------------------------------------------------*/
static wxString st_html_extr_meta_string( const wxString &s_in )
{
   /*--( Nothing to extract from an empty string )-------------------------*/
   if( s_in.empty() ) { return( wxEmptyString ) ; }

   /*--( The value starts on the first non space character )---------------*/
   sr::wxString_cit it_start = s_in.begin() ;
   sr::next_non_space( s_in.end(), it_start ) ;
   if( it_start == s_in.end() ) { return( wxEmptyString ) ; }

   /*--( An opening quote is skipped and becomes the end delimiter )-------*/
   wxUniChar c_delim = '\0' ;
   if( *it_start == '"' || *it_start == '\'' )
   {  c_delim = *it_start ;
      ++it_start ;
   }
   const bool boo_quoted = ( c_delim != '\0' ) ;

   /*--( Look for the end of the value )-----------------------------------*/
   sr::wxString_cit it_stop = it_start ;
   for( ; it_stop != s_in.end() ; ++it_stop )
   {
      /*-------------------------------------------------------------------*/
      if( boo_quoted )
      {  /*--( Quoted: everything is kept up to the same quote )-----------*/
         if( *it_stop == c_delim ) { break ; }
      }
      else
      {  /*--( Not quoted: only alphanumerics, '.', '-', '_' are kept )----*/
         if(    !sr::bool_isalnum( sr::unaccent( *it_stop ) )
             && *it_stop != '.'
             && *it_stop != '-'
             && *it_stop != '_'
           )
         {  break ; }
      }
      /*-------------------------------------------------------------------*/
   }

   /*----------------------------------------------------------------------*/
   return( wxString( it_start, it_stop ) ) ;
   /*----------------------------------------------------------------------*/
}

/*-------------------------------------------------------------------------*/
static int st_html_conv_tag_col( const wxString &s_var )
{
   /*----------------------------------------------------------------------*/
   wxString s_var_reworked( s_var ) ;

   /*----------------------------------------------------------------------*/
   s_var_reworked.MakeLower() ;

   /*--( Authorize the "dc." prefix )--------------------------------------*/
   if( s_var_reworked.StartsWith( "dc." ) )
   {  s_var_reworked.Remove( 0, 3 ) ; }
   /*----------------------------------------------------------------------*/
   if( s_var_reworked == "keywords" )
   {  return( COL_DOC_KEYWORDS ) ; }
   /*----------------------------------------------------------------------*/
   if( s_var_reworked == "description" )
   {  return( COL_DOC_COMMENT ) ; }
   /*----------------------------------------------------------------------*/
   if( s_var_reworked == "subject" )
   {  return( COL_DOC_SUBJECT ) ; }
   /*----------------------------------------------------------------------*/
   if(    s_var_reworked == "author"
       || s_var_reworked == "creator"
     )
   {  return( COL_DOC_AUTHOR ) ; }
   /*----------------------------------------------------------------------*/
   if( s_var_reworked ==  "date-creation-yyyymmdd" )
   {  return( COL_DOC_CRE_DATE ) ; }
   if( s_var_reworked ==  "date-revision-yyyymmdd" )
   {  return( COL_DOC_MOD_DATE ) ; }
   /*----------------------------------------------------------------------*/
   if( s_var_reworked ==  "generator" )
   {  return( COL_DOC_APPLICATION ) ; }
   /*----------------------------------------------------------------------*/
   return( COL_NB ) ;
   /*----------------------------------------------------------------------*/
}

/*--------------------------------------------------------------------------+
! Regular expressions used to parse the data ...                            !
+--------------------------------------------------------------------------*/

/*-------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------+
! All these patterns are compiled with "st_co_i_regex_flags".               !
! The pointers are constant too: nothing has to redirect them.              !
+--------------------------------------------------------------------------*/

/*--( Charset of a meta "http-equiv=Content-Type" tag: group 1 = charset )-*/
static const char * const st_co_p_c_regex_charset
= "<\\s*?meta\\W.*?http-equiv\\s*=\\s*\"?Content-Type\"?.*?"
  "charset\\s*=\\s*\"?([-_\\.A-Za-z0-9]*).*?>" ;

/*--( The tag title can contain attributes. Group 1 = title text )---------*/
static const char * const st_co_p_c_regex_title
= "<\\s*?title(?:\\s*>|\\W.*?>)\\s*(.*?)\\s*<\\s*/\\s*title\\s*?>" ;

/*--( On clubic.com, no "href" in the base tag. Group 1 = quoted href )----*/
static const char * const st_co_p_c_regex_base
= "<\\s*?base\\s[^<]*?href\\s*=\\s*\"(.*?)\"[^<]*?>" ;

/*--( For the meta, start and end will be used as limits )-----------------*/
static const char * const st_co_p_c_regex_meta
= "<\\s*?meta\\W.*?>" ;

/*--( Matches up to the '=' of the attribute, the value is read after it )-*/
static const char * const st_co_p_c_regex_name
= "\\Wname\\s*?=\\s*?" ;

/*--( Same principle for the "content" attribute )-------------------------*/
static const char * const st_co_p_c_regex_content
= "\\Wcontent\\s*?=\\s*?" ;

/*--( Flags used for every regular expression compiled in this file )------*/
static const int st_co_i_regex_flags = wxRE_ADVANCED | wxRE_ICASE ;


/*-------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------+
! Return the charset declared in the html source, or an empty string if     !
! none is found (or if a pattern can't be compiled).                        !
! Two patterns are tried, in this order:                                    !
! - the "http-equiv=Content-Type" meta tag                                  !
! - any "charset=" attribute of a meta tag. This covers the html5 form      !
!   <meta charset="utf-8"> and values quoted with "'", for which the first  !
!   pattern gives nothing.                                                  !
+--------------------------------------------------------------------------*/
static wxString st_html_get_charset( const wxString &s_buffer )
{
   /*--( Built like the first pattern, but limited to one single tag )-----*/
   static const char * const co_p_c_regex_charset_any
   = "<\\s*?meta\\s[^>]*?charset\\s*=\\s*[\"']?([-_\\.A-Za-z0-9]*)[^>]*?>" ;
   /*----------------------------------------------------------------------*/
   const char * const tb_co_p_c_regex[]
   = { st_co_p_c_regex_charset, co_p_c_regex_charset_any } ;
   const size_t co_sz_nb_regex
              = sizeof( tb_co_p_c_regex ) / sizeof( tb_co_p_c_regex[ 0 ] ) ;

   /*----------------------------------------------------------------------*/
   for( size_t sz_num = 0 ; sz_num < co_sz_nb_regex ; ++sz_num )
   {
      /*-------------------------------------------------------------------*/
      wxRegEx regex( tb_co_p_c_regex[ sz_num ], st_co_i_regex_flags ) ;

      /*-------------------------------------------------------------------*/
      if( !regex.IsValid() ) { wxFAIL ; return( wxEmptyString ) ; }

      /*--( The group accepts an empty name: then try the next pattern )---*/
      if( regex.Matches( s_buffer ) )
      {
         /*----------------------------------------------------------------*/
         const wxString s_charset = regex.GetMatch( s_buffer, 1 ) ;
         if( !s_charset.empty() ) { return( s_charset ) ; }
         /*----------------------------------------------------------------*/
      }
      /*-------------------------------------------------------------------*/
   }

   /*----------------------------------------------------------------------*/
   return( wxEmptyString ) ;
   /*----------------------------------------------------------------------*/
}

/*-------------------------------------------------------------------------*/
int CFileInit_html::tag_regex( const wxString &s_buffer,
                               const char * const co_p_c_regex, int i_col
                             )
{
   /*----------------------------------------------------------------------*/
   wxRegEx regex( co_p_c_regex, st_co_i_regex_flags ) ;

   /*----------------------------------------------------------------------*/
   if( !regex.IsValid() )
   {  wxLogError( "Pb exp html tag_regex #%s#", co_p_c_regex ) ;
      return( -1 ) ;
   }

   /*----------------------------------------------------------------------*/
   if( regex.Matches( s_buffer ) && m_fi.reserve_col( i_col ) )
   {
      /*-------------------------------------------------------------------*/
      wxString s_val = st_html_conv_html( regex.GetMatch( s_buffer, 1 ) ) ;
      /*-------------------------------------------------------------------*/
      if( m_fi.prepare_string( s_val ) > 0 )
      {  m_f.val_s( i_col ) = s_val ; }
      /*-------------------------------------------------------------------*/
   }
   /*----------------------------------------------------------------------*/
   return( 0 ) ;
   /*----------------------------------------------------------------------*/
}

/*-------------------------------------------------------------------------*/
int CFileInit_html::html_read()
{
   /*----------------------------------------------------------------------*/
   char     tb_c_buffer[ 2 * 1024 ]                  ;
   size_t   sz_buffer_len                            ;
   wxString s_buffer                                 ;
   wxString s_conv_charset                           ;
   const wxString co_s_def_char_conv( "ISO-8859-1" ) ;
   wxCSConv char_conv( co_s_def_char_conv )          ;
   int      i_col                                    ;
   /*--( To detect if any data has been found )----------------------------*/
   const size_t  co_sz_as_val_size_before = m_f.get_map_s_val().size() ;
   /*----------------------------------------------------------------------*/
   wxRegEx  regex_meta    ;
   wxRegEx  regex_name    ;
   wxRegEx  regex_content ;

   /*--( Load the text file )----------------------------------------------*/
   sz_buffer_len = sizeof( tb_c_buffer ) ;
   if( m_fa.read_buffer_max( sz_buffer_len, tb_c_buffer ) != 0 )
   {  return( -1 ) ; }

   /*-----------------------------------------------------------------------+
   ! And convert it to the right charset if specified.                      !
   ! If the buffer limit is not on a character boundary, the conversion can !
   ! fail.                                                                  !
   +-----------------------------------------------------------------------*/
   s_buffer = wxString( tb_c_buffer, char_conv, sz_buffer_len ) ;
   s_conv_charset = st_html_get_charset( s_buffer ) ;

   /*--( Reload if specified )---------------------------------------------*/
   if( !s_conv_charset.empty() )
   {
      /*-------------------------------------------------------------------*/
      m_f.val_s( COL_DOC_INFO ) = s_conv_charset ;
      /*--( "Reload" is only done if the charset is not the default one )--*/
      if( s_conv_charset.CmpNoCase( co_s_def_char_conv ) != 0 )
      {  /*--( Reload only done if charset ok )----------------------------*/
         char_conv = wxCSConv( s_conv_charset ) ;
         if( char_conv.IsOk() )
         {  s_buffer = wxString( tb_c_buffer, char_conv, sz_buffer_len ) ; }
         /*----------------------------------------------------------------*/
      }
      /*-------------------------------------------------------------------*/
   }

   /*--( Extract the title )-----------------------------------------------*/
   if( tag_regex( s_buffer, st_co_p_c_regex_title, COL_DOC_TITLE ) != 0 )
   {  return( -2 ) ; }

   /*--( Then the "base" )-------------------------------------------------*/
   if( tag_regex( s_buffer, st_co_p_c_regex_base, COL_DOC_URL ) != 0 )
   {  return( -3 ) ; }

   /*-----------------------------------------------------------------------+
   ! Extraction of the "meta" information                                   !
   +-----------------------------------------------------------------------*/
   if( !regex_meta.Compile( st_co_p_c_regex_meta, st_co_i_regex_flags ) )
   {  wxFAIL ; return( -4 ) ; }
   /*----------------------------------------------------------------------*/
   if( !regex_name.Compile( st_co_p_c_regex_name, st_co_i_regex_flags ) )
   {  wxFAIL ; return( -5 ) ; }
   /*----------------------------------------------------------------------*/
   if( !regex_content.Compile( st_co_p_c_regex_content, st_co_i_regex_flags))
   {  wxFAIL ; return( -6 ) ; }

   /*----------------------------------------------------------------------*/
   wxString s_meta        ;
   size_t   sz_meta_start ;
   size_t   sz_meta_len   ;
   size_t   sz_info_start ;
   size_t   sz_info_len   ;
   wxString s_var         ;
   wxString s_val         ;

   /*-----------------------------------------------------------------------+
   ! Scan of all meta definitions.                                          !
   ! Using pointers seems useless as "wxRegex::Matches" internally converts !
   ! its source to a wxString                                               !
   +-----------------------------------------------------------------------*/
   while(    regex_meta.Matches( s_buffer )
          && regex_meta.GetMatch( &sz_meta_start, &sz_meta_len )
        )
   {
      /*-------------------------------------------------------------------*/
      s_meta.assign( s_buffer, sz_meta_start, sz_meta_len ) ;
      s_buffer.erase( 0, sz_meta_start + sz_meta_len ) ;

      /*--( Search for the "name" in the meta string )---------------------*/
      if(    !regex_name.Matches( s_meta )
          || !regex_name.GetMatch( &sz_info_start, &sz_info_len )
        )
      {  continue ; }
      /*-------------------------------------------------------------------*/
      if( sz_info_len == 0 ) { continue ; }

      /*--( Extract the name and apply conversions )-----------------------*/
      s_var = st_html_extr_meta_string(
                                    s_meta.Mid( sz_info_start + sz_info_len )
                                      ) ;

      /*--( "Computation" of the column associated to this name )----------*/
      i_col = st_html_conv_tag_col( s_var ) ;
      if( m_fi.is_col_reserved( i_col ) ) { continue ; }

      /*--( Look for the "content" )---------------------------------------*/
      if(    !regex_content.Matches( s_meta )
          || !regex_content.GetMatch( &sz_info_start, &sz_info_len )
        )
      {  continue ; }
      if( !m_fi.reserve_col( i_col ) ) { continue ; }

      /*--( And extract the string associated to it )----------------------*/
        s_val
      = st_html_conv_html( st_html_extr_meta_string(
                                    s_meta.Mid( sz_info_start + sz_info_len )
                                                   )
                         ) ;

      /*-------------------------------------------------------------------*/
      if( !s_val.empty() && m_fi.prepare_string( s_val ) > 0 )
      {
         /*----------------------------------------------------------------*/
         wxDateTime dt ;
         /*--( If not a date or if the date is invalid it is just stored )-*/
         if(    (    i_col == COL_DOC_CRE_DATE
                  || i_col == COL_DOC_MOD_DATE
                )
             && sr::init_date_ymdhms( s_val, "%4d%2d%2d", dt ) == 0
           )
         {  m_fi.init_date( i_col, dt ) ; }
         else
         {  m_f.val_s( i_col ) = s_val ; }
         /*----------------------------------------------------------------*/
      }
      /*-------------------------------------------------------------------*/
   }

   /*--( Something should have been found )--------------------------------*/
   if( m_f.get_map_s_val().size() == co_sz_as_val_size_before )
   {  return( -7 ) ; }

   /*----------------------------------------------------------------------*/
   return( 0 ) ;
   /*----------------------------------------------------------------------*/
}

/*-------------------------------------------------------------------------*/
int CFileInit::init_html()
{
   /*----------------------------------------------------------------------*/
   m_s_type_det = "html" ;
   /*----------------------------------------------------------------------*/
   return( CFileInit_html( *this ).html_read() ) ;
   /*----------------------------------------------------------------------*/
}

/*-------------------------------------------------------------------------*/



/*==========================================================================+
!                      End of file CFileInit_html.cpp                       !
+==========================================================================*/
