#!/usr/bin/perl
#
#           RiSearch SQL
#
# web search engine, version 0.1
# (c) Sergej Tarasov, 2000-2002
#
# Homepage: http://risearch.org/
# email: risearch@risearch.org
# Last modified: 29.07.2002




#===================================================================
#
#         Set variables below 
#
#===================================================================

# Database connection parameters: server host, database name,
# user name and password.
$DBSERVER="localhost";
$DATABASE="sql";
$USERNAME="sql";
$PASSWORD="sql";

# Directory where your HTML files are located.
# In most cases you may use a path relative to the location of the script,
# or an absolute path.
# Type "./" for the current directory.
$base_dir = ".";

# Base URL of your site
$base_url = "http://www.server.com/";


# Write queries to a log file ("YES" or "NO").
# Please note: you must create the directory "log" by hand;
# the script does not check whether this directory exists.
# Please also edit the relevant parameters in the file "stat.pl".
$create_log = "YES";


#===================================================================
#
#         These variables are used by spider 
#
#===================================================================

# NOTE(review): @start_url and @allow_url are each assigned twice in this
# section. A later assignment replaces the earlier one, so only the SECOND
# pair of lists takes effect; the first pair is overridden. Edit the second
# pair, or delete whichever pair you do not need.

# Starting URLs (used by the spider)
@start_url = qw(
http://www.search.com/index.html
http://www.yourserver.com/
);

# The spider will index only files from these servers
@allow_url = qw(
http://www.search.com/
http://www.yourserver.com/
http://www.server.com/
http://another.server.com/
);

# Starting URLs (used by the spider) -- this assignment overrides the one above
@start_url = qw(
http://www.search.ru/index.html
);

# The spider will index only files from these servers -- this assignment
# overrides the one above
@allow_url = qw(
http://www.search.ru/
);

#===================================================================
#
#     All other variables are optional. The script should work fine
#  with the default settings.
#     These variables control the indexing process.
#
#===================================================================

# Verbose output ("YES" or "NO").
# NOTE(review): the consumer of this flag is not in this file; presumably
# it enables progress messages while indexing -- confirm in the indexer.
$verbose_output = "YES";

# File extensions to index (whitespace-separated list).
# Add "NONE" if you want to index files without extensions.
# This list is converted into a regular expression further down in this file.
$file_ext = 'html txt htm shtml php';
# Extensions of files that should not be parsed (whitespace-separated list).
# NOTE(review): presumably these files are indexed as plain text without
# HTML parsing -- confirm in the indexer.
$non_parse_ext = 'txt';

# List of directories that should not be indexed (whitespace-separated)
$no_index_dir = 'img image temp tmp cgi-bin';

# List of files that should not be indexed (whitespace-separated)
$no_index_files = 'robots.txt dir1/no_index.html';

# Minimum and maximum length of words to index
$min_length = 3;
$max_length = 30;

# Whether to index numbers (set   $numbers = ""   if you don't want to index numbers).
# You may add other non-letter characters here that you want to index.
$numbers = '0-9';

# Parts of documents that should not be indexed.
# Uncomment and edit the hash below if you want to use this feature
# (each key is a start marker, its value the matching end marker).
$use_selective_indexing = "NO";
#%no_index_strings = (
#    q[<!-- No index start 1 -->] => q[<!-- No index end 1 -->],
#    q[<!-- No index start 2 -->] => q[<!-- No index end 2 -->],
#);

# Cut default filenames from URLs ("YES" or "NO").
# $default_filenames is the whitespace-separated list of such filenames.
$cut_default_filenames = 'YES';
$default_filenames = 'index.htm index.html default.htm';

# Convert URLs to lower case ("YES" or "NO")
$url_to_lower_case = 'NO';

# Translate escape sequences (like &Egrave; or &#255;) ("YES" or "NO").
# With "YES" the entity table is built by html_esc() later in this file.
$use_esc = "YES";

# Index META tags ("YES" or "NO")
$use_META = "NO";

# Index the ALT attribute of IMG tags ("YES" or "NO")
$use_ALT = "NO";

# Delete hyphens at the end of strings ("YES" or "NO")
$del_hyphen = "NO";

# Use the list of stop words ("YES" or "NO").
# With "YES" the words below are loaded into the %stop_words hash
# later in this file.
$use_stop_words = "YES";
@stop_words = qw(
and any are but can had has have her here him his
how its not our out per she some than that the their them then there
these they was were what you
);

#===================================================================
#
#     These variables control the script output.
#
#===================================================================

# Number of results per page
$res_num=10;

# Length of the page description shown in the output, and whether to use
# the META description ("YES") or the first "n" characters of the page ("NO")
$descr_size = 256;
$use_META_descr = "NO";

# Truncate the TITLE of a page to 64 characters ("YES" or "NO")
$truncate_title = "NO";

#===================================================================
#
#   Change below only if you need multilanguage support
#   With default settings script will work with
#   English, Russian (win1251 encoding) and most European languages
#
#===================================================================

# Capital letters outside ASCII, written as a character-class fragment:
# the byte range \xC0-\xDF plus the single byte \xA8.
# The string is single-quoted, so the \x escapes are stored literally.
# NOTE(review): presumably interpolated into a regex character class by
# the indexer -- confirm there.
$CAP_LETTERS = '\xC0-\xDF\xA8';

# Lower case letters outside ASCII, in the same form:
# the byte range \xE0-\xFF plus the single byte \xB8.
$LOW_LETTERS = '\xE0-\xFF\xB8';

# If your site uses Unicode characters in the form
# &#NNNN; (where NNNN>255), uncomment and edit the entries below.
# Each key is a decimal character code, each value the replacement string;
# esc2char() in this file looks up codes above 255 in this hash.
# Samples for different languages can be found in the file unicode.txt

%code2char = (
# 1040 => "",
# 1041 => "",
# 1042 => "",
# 1043 => "",
);

#===================================================================
#
#            --- end of configuration --- 
#
# Please do not edit below this line unless you know what you are doing
#
#===================================================================

# Turn a whitespace-separated word list into a regex alternation group.
#
#   prepare_string("html txt")    returns  "(html|txt)"
#   prepare_string("robots.txt")  returns  "(robots\.txt)"
#
# Parameters:
#   $str - items separated by whitespace; leading and trailing whitespace
#          is ignored
# Returns:
#   the items joined with "|" and wrapped in parentheses. Every "." is
#   escaped so that it matches a literal dot; other regex metacharacters
#   are passed through unchanged.
sub prepare_string {
    my $str = shift;
    # /g is required here: without it only ONE end of the string is trimmed,
    # and leftover trailing whitespace would turn into an empty alternative
    # such as "(html|txt|)", which also matches the empty string.
    $str =~ s/^\s+|\s+$//g;
    $str =~ s/\s+/|/g;
    $str =~ s/\./\\./g;
    return "(" . $str . ")";
}

# Convert the whitespace-separated configuration lists into regex source
# strings. Each variable is overwritten in place with its pattern.

# "NONE" in $file_ext requests that files without any extension are
# accepted too; the marker itself is removed before the list is converted.
my $index_extensionless = ($file_ext =~ s/NONE//);
my $ext_group = prepare_string($file_ext);
$file_ext = $index_extensionless
    ? '(\.' . $ext_group . '|^[^.]+)$'
    : '\.' . $ext_group . '$';


# Extensions are matched at the end of a name, after a literal dot.
$non_parse_ext = '\.' . prepare_string($non_parse_ext) . '$';

# The exclusion lists are kept as bare, unanchored alternation groups.
# The loop variable aliases each global, so assigning to it updates them.
foreach my $list ($no_index_dir, $no_index_files) {
    $list = prepare_string($list);
}

# Default filenames are matched as the last path component of a URL.
$default_filenames = '/' . prepare_string($default_filenames) . '$';

#===================================================================

# Build the %stop_words lookup hash: every configured stop word becomes a
# key with an empty-string value. The hash stays empty when stop words are
# switched off.
%stop_words = ();
@stop_words{@stop_words} = ("") x @stop_words if $use_stop_words eq "YES";
#=====================================================================

# Fill the HTML entity table only when escape translation is enabled.
html_esc() if $use_esc eq "YES";

# Populate the global %html_esc table, which maps HTML named character
# entities (including the leading "&" and trailing ";") to their
# replacement characters.
#
# The Latin-1 letter entities map to the single character with the same
# code (192..255). "&nbsp;", "&amp;" and "&quot;" map to a space.
#
# Takes no arguments. Any previous content of %html_esc is replaced.
sub html_esc {
    %html_esc = (
        "&Agrave;" => chr(192),
        "&Aacute;" => chr(193),
        "&Acirc;" => chr(194),
        "&Atilde;" => chr(195),
        "&Auml;" => chr(196),
        "&Aring;" => chr(197),
        "&AElig;" => chr(198),
        "&Ccedil;" => chr(199),
        "&Egrave;" => chr(200),
        "&Eacute;" => chr(201),
        # Was misspelled "&Eirc;", so the real entity was never translated.
        "&Ecirc;" => chr(202),
        "&Euml;" => chr(203),
        "&Igrave;" => chr(204),
        "&Iacute;" => chr(205),
        "&Icirc;" => chr(206),
        "&Iuml;" => chr(207),
        "&ETH;" => chr(208),
        "&Ntilde;" => chr(209),
        "&Ograve;" => chr(210),
        "&Oacute;" => chr(211),
        "&Ocirc;" => chr(212),
        "&Otilde;" => chr(213),
        "&Ouml;" => chr(214),
        "&times;" => chr(215),
        "&Oslash;" => chr(216),
        "&Ugrave;" => chr(217),
        "&Uacute;" => chr(218),
        "&Ucirc;" => chr(219),
        "&Uuml;" => chr(220),
        "&Yacute;" => chr(221),
        "&THORN;" => chr(222),
        "&szlig;" => chr(223),
        "&agrave;" => chr(224),
        "&aacute;" => chr(225),
        "&acirc;" => chr(226),
        "&atilde;" => chr(227),
        "&auml;" => chr(228),
        "&aring;" => chr(229),
        "&aelig;" => chr(230),
        "&ccedil;" => chr(231),
        "&egrave;" => chr(232),
        "&eacute;" => chr(233),
        "&ecirc;" => chr(234),
        "&euml;" => chr(235),
        "&igrave;" => chr(236),
        "&iacute;" => chr(237),
        "&icirc;" => chr(238),
        "&iuml;" => chr(239),
        "&eth;" => chr(240),
        "&ntilde;" => chr(241),
        "&ograve;" => chr(242),
        "&oacute;" => chr(243),
        "&ocirc;" => chr(244),
        "&otilde;" => chr(245),
        "&ouml;" => chr(246),
        "&divide;" => chr(247),
        "&oslash;" => chr(248),
        "&ugrave;" => chr(249),
        "&uacute;" => chr(250),
        "&ucirc;" => chr(251),
        "&uuml;" => chr(252),
        "&yacute;" => chr(253),
        "&thorn;" => chr(254),
        "&yuml;" => chr(255),
        "&nbsp;" => " ",
        "&amp;" => " ",
        # The HTML entity for the double quote is "&quot;".
        "&quot;" => " ",
        # Misspelled form, kept so existing behaviour for it is unchanged.
        "&quote;" => " ",
    );

}
#=====================================================================

# Translate one HTML escape sequence into its replacement character.
#
# Parameters:
#   $esc - the escape sequence, e.g. "&eacute;", "&#233;" or "&#xE9;"
# Returns:
#   - for a named entity: its entry in the global %html_esc table
#   - for a decimal or hexadecimal reference up to 255: chr(code)
#   - for a numeric reference above 255: its entry in the global
#     %code2char table, keyed by the numeric code
#   - the empty string when the sequence is not recognised or has no
#     table entry (never undef)
sub esc2char {
    my ($esc) = @_;
    my $char;
    if ($esc =~ /&[a-zA-Z]*;/) {
        $char = $html_esc{$esc};
    } elsif ($esc =~ /&#([0-9]+);/) {
        # At least one digit is required, so "&#;" is no longer turned
        # into chr(0). Adding 0 normalises the code so that leading zeros
        # ("&#01040;") still find the %code2char key 1040.
        my $code = $1 + 0;
        $char = ($code <= 255) ? chr($code) : $code2char{$code};
    } elsif ($esc =~ /&#x([0-9a-fA-F]+);/i) {
        my $code = hex($1);
        $char = ($code <= 255) ? chr($code) : $code2char{$code};
    }
    # Missing table entries yield undef; callers get "" instead.
    return defined($char) ? $char : "";
}
#=====================================================================



1;