#!/usr/bin/perl
#
#           RiSearch SQL
#
# web search engine, version 0.1
# (c) Sergej Tarasov, 2000-2002
#
# Homepage: http://risearch.org/
# email: risearch@risearch.org
# Last modified: 29.07.2002


use DBI;
use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;
require './config.pl';


print "Content-Type: text/html\n\n";



$code = "\${\$_[0]} =~ tr/-a-zA-Z$CAP_LETTERS$LOW_LETTERS$numbers/ /cs;";
$remove_non_alphabetic = eval "sub { $code }";

$code = "\${\$_[0]} =~ tr/A-Z$CAP_LETTERS/a-z$LOW_LETTERS/;";
$to_lower_case = eval "sub { $code }";


if($ENV{'REQUEST_METHOD'} eq 'GET'){ 
   $query=$ENV{'QUERY_STRING'};
}
elsif($ENV{'REQUEST_METHOD'} eq 'POST'){
   read(STDIN, $query, $ENV{'CONTENT_LENGTH'});
}


my @formfields=split /&/,$query;
foreach(@formfields){
   if(/^url=(.*)/) {$ndquery=$1}
}
my $get_url = urldecode($ndquery);

&read_template("template.htm");
print &print_template("header");


if ($get_url !~ m|http://[a-zA-Z0-9/#~:.?+=&%@!\-]+|) {
print "Error in URL";
print &print_template("footer");

exit;    
}

$get_url =~ m|http://([^/]+)|;
my $host = $1;


$dbh = DBI->connect("DBI:mysql:${DATABASE}:${DBSERVER}", $USERNAME, $PASSWORD);

    index_page($get_url);

$dbh->disconnect;


print &print_template("footer");

#===================================================================

sub index_page {
    my $get_url = shift;
    
    my $hdrs = new HTTP::Headers(Accept => 'text/html');
    my $req = new HTTP::Request('GET', $get_url, $hdrs);
    my $ua = new LWP::UserAgent;

    my $resp = $ua->request($req);
    my $BASE = $resp->base;

    if ($resp->is_success) {
        my $data = $resp->content;
        
        index_file($data, $get_url);
        
    } else {
        print $resp->message;
        print print_template($templates{"footer"});
        exit;
 
    }
    
}
#===================================================================

sub index_file {
    my $html_text=$_[0];
    my $url=$_[1];

    $non_parse = 0;
    if ($url =~ m|$non_parse_ext$|io) {$non_parse++}
    $size = int length($html_text)/1024;
    $kbcount += $size;

# Delete parts of document, which should not be indexed
    foreach $key (keys %no_index_strings) {
    	$val = $no_index_strings{$key};
        $html_text =~ s/$key.*?$val/ /gs; 
    }

    if ($non_parse == 0) {
        $html_text =~ s/<!--.*?-->/ /gs;
        $html_text =~ s/<[Ss][Cc][Rr][Ii][Pp][Tt].*?<\/[Ss][Cc][Rr][Ii][Pp][Tt]>/ /gs;
        $html_text =~ s/<[Ss][Tt][Yy][Ll][Ee].*?<\/[Ss][Tt][Yy][Ll][Ee]>/ /gs;

        $html_text =~ s#<[Tt][Ii][Tt][Ll][Ee]>\s*(.*?)\s*</[Tt][Ii][Tt][Ll][Ee]># #s;
        $title = $1;
        $title =~ s/\s+/ /gs;
        $TITLE = $title;
        if ($truncate_title eq "YES") { $TITLE = substr($TITLE,0,64) }
        if ($TITLE eq "") {$TITLE = "No title"};

        if ($use_META eq "YES") { ($keywords,$description) = &get_META_info(\$html_text) }
        if ($use_ALT eq "YES") {
    	     $alt = join " ", ($html_text =~ m/<[Ii][Mm][Gg][^>]+[Aa][Ll][Tt]="([^"]*)"[^>]*>/gs );
        }
        if ($del_hyphen eq "YES") { &del_hyphen(\$html_text) }
        $html_text =~ s/<[^>]*>/ /gs;
        if ($use_esc eq "YES") { $html_text =~ s/(&.*?;)/&esc2char($1)/egs; }
        $html_text =~ s/\s+/ /gs;
        if (($use_META_descr eq "YES") & ($description ne "")) {
            $descript = substr($description,0,$descr_size);
        } else {
            (my $dum = substr($html_text,0,1024)) =~ s/\s+/ /gs;
            $descript = substr($dum,0,$descr_size);
        }
        $html_text .= " ".$title." ".$keywords." ".$decription." ".$alt;
    } else {
    	$html_text =~ s/\s+/ /gs;
        $title = "No title";
        $descript = substr($html_text,0,$descr_size);
    }
    
    &$remove_non_alphabetic(\$html_text);
    &$to_lower_case(\$html_text);
    $wwd = join " ", ($html_text =~ m/([^- ]+-[^ ]+[^- ])/gs);
    $html_text =~ tr/-/ /;
    $html_text .= " ".$wwd;
    $pos = pack("N",tell(FINFO));
    $TITLE =~ s/:+/:/g;
    $descript =~ s/:+/:/g;

    $q_url = $dbh->quote($url);
    $q_TITLE = $dbh->quote($TITLE);
    $q_descript = $dbh->quote($descript);
    
    $ins = $dbh->do("INSERT INTO documents (filename,title,description,file_size)
                    VALUES ($q_url,$q_TITLE,$q_descript,$size)")
                    or print "Document not inserted...<BR>";
                    
    $doc_id = $dbh->selectrow_array("SELECT LAST_INSERT_ID()");

    my %seen = ();
    @seen{split (/\s+/,$html_text)} = ();
    foreach $word (keys %seen) {
        if (length($word) < $min_length) { next }
        if (length($word) > $max_length) { $word = substr($word,0,$max_length) }
        if (exists($stop_words{$word})) { next }
        
        $word_id = $dbh->selectrow_array("SELECT word_id FROM
        							words WHERE word='$word'");
        
        if( !$word_id) {
            $dbh->do("INSERT INTO words (word) VALUES ('$word')") or print "Can't insert\n";
            $word_id = $dbh->selectrow_array("SELECT LAST_INSERT_ID()");
        }
        
        $ins = $dbh->do("INSERT INTO word_link (word_id, document_id)
                         VALUES ('$word_id','$doc_id')") or print "Can't insert\n";
        
        
    }
    
    print "Page added";


};     # sub index_file
#=====================================================================

sub read_template {
    my ($filename) = @_;
    
    open TEMPLATE, $filename or print "Could not find template";
    local $/;
    my $template = <TEMPLATE>;
    close(TEMPLATE);
    
    while ( $template =~ m|<!-- RiSearch::([^:]+?)::start -->(.*?)<!-- RiSearch::\1::end -->|gs) {
    	$templates{$1} = $2;
    }   
    return 1;
}
#===================================================================

sub print_template {
    my $part = shift;
    my $template = $templates{$part};      
    my $rand_number = int (rand(256));
    
    $template =~ s|%query%|$query|gs;
    $template =~ s|%search_time%|$search_time|gs;
    $template =~ s|%query_statistics%|$query_statistics|gs;
    $template =~ s|%stpos%|$stpos+1|egs;
    $template =~ s|%url%|$url|gs;
    $template =~ s|%title%|$title|gs;
    $template =~ s|%size%|$size|gs;
    $template =~ s|%description%|$description|gs;
    $template =~ s|%rescount%|$rescount|gs;
    $template =~ s|%next_results%|$next_results|gs;
    $template =~ s|%rand_number%|$rand_number|gs;
    $template =~ s|%right_form\((.*?)\)%|&right_form($1)|egs;
    
    return $template;
}
#===================================================================

sub get_META_info {
    my ($html) = @_;
    $keywords    = ($$html =~ s/<[Mm][Ee][Tt][Aa]\s*[Nn][Aa][Mm][Ee]=\"?[Kk][Ee][Yy][Ww][Oo][Rr][Dd][Ss]\"?\s*[Cc][Oo][Nn][Tt][Ee][Nn][Tt]=\"?([^\"]*)\"?>//s) ? $1 : '';
    $description = ($$html =~ s/<[Mm][Ee][Tt][Aa]\s*[Nn][Aa][Mm][Ee]=\"?[Dd][Ee][Ss][Cc][Rr][Ii][Pp][Tt][Ii][Oo][Nn]\"?\s*[Cc][Oo][Nn][Tt][Ee][Nn][Tt]=\"?([^\"]*)\"?>//s) ? $1 : '';
    return ($keywords, $description)
}
#=====================================================================

sub del_hyphen {
    my ($text) = @_;
    local $/;
    $$text =~ s/-\n//gs;
}
#=====================================================================

sub my_die {
   my ($str) = @_;
   print "$str\n";
   rename("db/0_locked","db/0_unlocked");
   die
}
#===================================================================

sub urldecode{    
 my ($val)=@_;  
 $val=~s/\+/ /g;
 $val=~s/%([0-9A-H]{2})/pack('C',hex($1))/ge;
 return $val;
}
#===================================================================





