#!/usr/bin/perl
#
#           RiSearch SQL
#
# web search engine, version 0.1
# (c) Sergej Tarasov, 2000-2002
#
# Homepage: http://risearch.org/
# email: risearch@risearch.org
# Last modified: 29.07.2002


use DBI;
use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;
require './config.pl';
require './tables.pl';


# When running as a CGI script, send the header before anything else can
# produce output.
if (exists($ENV{'GATEWAY_INTERFACE'})) {print "Content-Type: text/plain\n\n"}

# Crawl queue, seeded with the start URLs, and the set of URLs already fetched.
@to_visit = ();
push(@to_visit,@start_url);
%visited_url = ();

# Running counters (not constants): files indexed, words, kilobytes read.
$cfn = 0;
$cwn = 0;
$kbcount = 0;

# DBI->connect returns undef on failure; stop here instead of crashing later
# on a method call against an undefined handle.
$dbh = DBI->connect("DBI:mysql:${DATABASE}:${DBSERVER}", $USERNAME, $PASSWORD)
    or die "Cannot connect to database: $DBI::errstr\n";
create_tables();
$dbh->do("LOCK TABLES documents WRITE, words WRITE, word_link WRITE")
    or warn "Cannot lock tables: " . $dbh->errstr . "\n";

# The character sets come from configuration, and tr/// cannot interpolate
# variables, so the two text filters are compiled once from strings.
# A broken letter set must be reported here, not on first use.
$code = "\${\$_[0]} =~ tr/-a-zA-Z$CAP_LETTERS$LOW_LETTERS$numbers/ /cs;";
$remove_non_alphabetic = eval "sub { $code }"
    or die "Cannot compile non-alphabetic filter: $@";

$code = "\${\$_[0]} =~ tr/A-Z$CAP_LETTERS/a-z$LOW_LETTERS/;";
$to_lower_case = eval "sub { $code }"
    or die "Cannot compile lower-case filter: $@";


$time1 = time;
$time = sprintf("%02d:%02d:%02d", (localtime($time1))[2,1,0]);
print "Scan started: $time\n";

start_spider();

$time4 = time;
$time = sprintf("%02d:%02d:%02d", (localtime($time4))[2,1,0]);
print "\nIndexing finished: $time\n";


# Elapsed time is computed arithmetically so that runs longer than a day do
# not wrap around (as they would when formatted through gmtime).
my $elapsed = $time4 - $time1;
$time = sprintf("%d:%02d:%02d",
                int($elapsed / 3600), int(($elapsed % 3600) / 60), $elapsed % 60);
print "Total time: $time (h:mm:ss)\n";

# Report real row counts: $doc_id and $word_id only hold the ids that were
# touched last, which are not totals.
my ($doc_total)  = $dbh->selectrow_array("SELECT COUNT(*) FROM documents");
my ($word_total) = $dbh->selectrow_array("SELECT COUNT(*) FROM words");
$doc_total  = 0 unless defined $doc_total;
$word_total = 0 unless defined $word_total;
print "\n$doc_total files are indexed, $word_total unique words are stored in database\n";

# Release the locks taken above and close the connection cleanly.
$dbh->do("UNLOCK TABLES");
$dbh->disconnect;

#=====================================================================

# Crawl loop: takes URLs from the global @to_visit queue until it is empty,
# fetches each one, and on success passes the body to extract_links() (which
# appends newly found URLs to @to_visit) and index_file().
#
# Globals used: @to_visit (queue), %visited_url (requested URLs and the base
# URLs of their responses), $BASE (base URL of the current response, also
# read by extract_links()).
sub start_spider {

    # One user agent object is enough for the whole crawl.
    my $ua = LWP::UserAgent->new;

    while (@to_visit) {
        my $get_url = shift @to_visit;

        # A stray empty entry must not end the crawl; just skip it.
        next if !defined($get_url) || $get_url eq "";
        next if exists($visited_url{$get_url});

        # The header names are quoted on purpose: "=>" only auto-quotes plain
        # identifiers, so an unquoted User-Agent is parsed as a subtraction
        # and the intended header is never set.
        my $hdrs = HTTP::Headers->new('Accept'     => 'text/html',
                                      'User-Agent' => 'RiSpider/1.0');

        my $url = URI::URL->new($get_url);
        print "$get_url\n";

        my $req  = HTTP::Request->new('GET', $url, $hdrs);
        my $resp = $ua->request($req);

        # The base can differ from the requested URL (for example after a
        # redirect); remember both so neither is fetched again.
        $BASE = $resp->base;
        print "$BASE\n";
        $visited_url{$get_url}++;
        if ($BASE ne $get_url) {
            if (exists($visited_url{$BASE})) { print "Already visited\n\n"; next }
            $visited_url{$BASE}++;
        }

        if ($resp->is_success) {
            my $data = $resp->content;
            extract_links($data);
            index_file($data, $get_url);
        } else {
            print $resp->message, "\n";
        }
        print "Done\n\n";
    }

}   # end sub start_spider

#=====================================================================

# Extracts the links of an HTML document and appends the ones that should be
# crawled to the global @to_visit queue.
#
# Parameter: the HTML text of the document.
# Links are resolved by HTML::LinkExtor against the global $BASE.
#
# Only <a href>, <frame src> and <area href> are followed. A link is queued
# when it matches one of @allow_url and is not rejected by $file_ext,
# $no_index_dir or $no_index_files (all taken from configuration).
sub extract_links {
    my $data = shift;

    # Tag => attribute pairs that lead to further documents.
    my %follow_attr_for = (a => 'href', frame => 'src', area => 'href');

    my $parser = HTML::LinkExtor->new(undef,$BASE);
    $parser->parse($data)->eof;

    foreach my $link ($parser->links) {
        my ($el_type, @element) = @$link;

        # Check the tag first so the configured filters are not run for
        # links that would be discarded anyway.
        next unless exists $follow_attr_for{$el_type};

        while (@element) {
            my ($attr_name, $attr_value) = splice(@element, 0, 2);
            next unless $attr_name eq $follow_attr_for{$el_type};

            # Filter on a copy with query string and fragment removed.
            my $new_link = "$attr_value";
            my $allowed = 0;
            foreach my $allow_re (@allow_url) {
                if ($new_link =~ /$allow_re/) { $allowed = 1; last }
            }
            next unless $allowed;

            $new_link =~ s/[#?].*?$//;
            if ($new_link !~ /$file_ext/i && $new_link !~ /\/[^.]*?$/) { next }

            # An unset exclusion means "exclude nothing". Without the guards
            # an empty $no_index_dir gives the pattern "//", which matches
            # every absolute URL, and an empty $no_index_files makes Perl
            # reuse the last successfully matched pattern.
            if (defined($no_index_dir) && $no_index_dir ne ""
                && $new_link =~ /\/$no_index_dir\//) { next }
            if (defined($no_index_files) && $no_index_files ne ""
                && $new_link =~ /$no_index_files/) { next }

            # The queued URL keeps its query string; only the fragment goes.
            my $target = "$attr_value";
            $target =~ s/#.*?$//;
            if ($cut_default_filenames eq 'YES') { $target =~ s|$default_filenames|/|io }

            # start_spider() would skip visited URLs anyway; not queueing
            # them keeps @to_visit smaller.
            next if exists $visited_url{$target};

            push(@to_visit,$target);
        }
    }
}

#=====================================================================

# Indexes one fetched document: stores a row in "documents" and links every
# indexable word of the text to it through "words" and "word_link".
#
# Parameters:
#   $_[0]  raw document text
#   $_[1]  URL the document was requested from (stored as its filename)
#
# Globals updated: $cfn (documents processed), $kbcount (kilobytes read),
# $doc_id (id of the inserted document row), $word_id (id of the word
# handled last). Reads $dbh, the two compiled text filters and the settings
# from configuration.
#
# Returns nothing. If the document row cannot be inserted, the document is
# skipped, so its words are never attached to another document's id.
sub index_file {
    my ($html_text, $url) = @_;

    # Documents whose URL ends in $non_parse_ext are indexed as plain text.
    my $non_parse = ($url =~ m|$non_parse_ext$|io) ? 1 : 0;
    my $size = int(length($html_text) / 1024);
    $kbcount += $size;
    $cfn++;
    print "$cfn -> $url; totalsize -> $kbcount\n";

# Delete parts of document, which should not be indexed
    foreach my $start_mark (keys %no_index_strings) {
        my $end_mark = $no_index_strings{$start_mark};
        $html_text =~ s/$start_mark.*?$end_mark/ /gs;
    }

    # Per-document values; lexical so nothing leaks from the previous document.
    my ($title, $keywords, $description, $alt) = ('', '', '', '');
    my ($TITLE, $descript);

    if (!$non_parse) {
        $html_text =~ s/<!--.*?-->/ /gs;
        $html_text =~ s/<script.*?<\/script>/ /gis;
        $html_text =~ s/<style.*?<\/style>/ /gis;

        # Read the capture only when the substitution matched; otherwise $1
        # still holds whatever an earlier match left behind.
        if ($html_text =~ s#<title>\s*(.*?)\s*</title># #is) {
            $title = $1;
        }
        $title =~ s/\s+/ /g;
        $TITLE = $title;
        if ($truncate_title eq "YES") { $TITLE = substr($TITLE,0,64) }
        if ($TITLE eq "") { $TITLE = "No title" }

        if ($use_META eq "YES") { ($keywords,$description) = get_META_info(\$html_text) }
        if ($use_ALT eq "YES") {
            $alt = join " ", ($html_text =~ m/<img[^>]+alt="([^"]*)"[^>]*>/gis );
        }
        if ($del_hyphen eq "YES") { del_hyphen(\$html_text) }
        $html_text =~ s/<[^>]*>/ /gs;
        if ($use_esc eq "YES") { $html_text =~ s/(&.*?;)/esc2char($1)/egs; }
        $html_text =~ s/\s+/ /gs;

        if (($use_META_descr eq "YES") && ($description ne "")) {
            $descript = substr($description,0,$descr_size);
        } else {
            # Taken from the first 1024 characters of the cleaned text.
            $descript = substr(substr($html_text,0,1024),0,$descr_size);
        }

        # Make title, keywords, description and ALT texts searchable too.
        $html_text .= join(" ", "", $title, $keywords, $description, $alt);
    } else {
        $html_text =~ s/\s+/ /gs;
        $title = "No title";
        $TITLE = $title;
        $descript = substr($html_text,0,$descr_size);
    }

    $remove_non_alphabetic->(\$html_text);
    $to_lower_case->(\$html_text);

    # Index hyphenated words both as a whole and as their separate parts.
    my $hyphenated = join " ", ($html_text =~ m/([^- ]+-[^ ]+[^- ])/gs);
    $html_text =~ tr/-/ /;
    $html_text .= " ".$hyphenated;

    # Runs of colons are collapsed as before; presumably a leftover from a
    # ':'-delimited storage format - TODO confirm whether still needed.
    $TITLE =~ s/:+/:/g;
    $descript =~ s/:+/:/g;

    # Placeholders let the driver do the quoting for every value.
    my $inserted = $dbh->do(
        "INSERT INTO documents (filename,title,description,file_size)
         VALUES (?,?,?,?)",
        undef, $url, $TITLE, $descript, $size);
    if (!$inserted) {
        # Without a new row LAST_INSERT_ID() would return an older id.
        print "Document not inserted...\n";
        return;
    }

    $doc_id = $dbh->selectrow_array("SELECT LAST_INSERT_ID()");

    my %seen = ();
    @seen{split (/\s+/,$html_text)} = ();
    print "Put info into DB\n";

    # Truncation can map several words to one stored word; link each once.
    my %linked_word_id = ();

    foreach my $word (keys %seen) {
        if (length($word) < $min_length) { next }
        if (length($word) > $max_length) { $word = substr($word,0,$max_length) }
        if (exists($stop_words{$word})) { next }

        $word_id = $dbh->selectrow_array(
            "SELECT word_id FROM words WHERE word=?", undef, $word);

        if (!$word_id) {
            if (!$dbh->do("INSERT INTO words (word) VALUES (?)", undef, $word)) {
                print "Can't insert\n";
                next;
            }
            $word_id = $dbh->selectrow_array("SELECT LAST_INSERT_ID()");
        }

        next if $linked_word_id{$word_id}++;

        $dbh->do("INSERT INTO word_link (word_id, document_id) VALUES (?,?)",
                 undef, $word_id, $doc_id) or print "Can't insert\n";
    }

    return;
}      # sub index_file
#=====================================================================

# Extracts the "keywords" and "description" META tags from an HTML document.
#
# Parameter: reference to the HTML text. The first matching tag of each kind
# is removed from the referenced text.
# Returns: the list (keywords, description); a missing tag gives ''.
#
# Tag and attribute names are matched case-insensitively, with NAME before
# CONTENT. Values may be double-quoted, single-quoted or unquoted; spaces
# around "=" and an XHTML-style "/>" are accepted.
sub get_META_info {
    my ($html) = @_;

    my %content_of = (keywords => '', description => '');

    foreach my $name (qw(keywords description)) {
        # An unquoted value stops at whitespace or ">" so it cannot swallow
        # the markup that follows the tag.
        if ($$html =~ s{<meta\s*name\s*=\s*["']?$name["']?\s*content\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>"']*?))\s*/?>}{}is) {
            $content_of{$name} = defined $1 ? $1
                               : defined $2 ? $2
                               :              $3;
        }
    }

    return ($content_of{keywords}, $content_of{description});
}
#=====================================================================

# Joins words that were hyphenated across a line break by deleting every
# hyphen that is directly followed by a newline (LF or CRLF).
#
# Parameter: reference to the text; it is modified in place.
# Hyphens inside a line are left alone.
sub del_hyphen {
    my ($text) = @_;
    # \r? also covers CRLF line endings.
    $$text =~ s/-\r?\n//g;
}
#=====================================================================

# Prints the given message followed by a newline to the currently selected
# output handle, then aborts by calling die without a message of its own.
sub my_die {
   my $message = shift;
   print $message . "\n";
   die;
}
#===================================================================
