# Perlfect Search - tools.pl # Mapping of Numbers to HTML entities and other useful functions # Hash taken from htdig's SGMLEntities.cc (shortened) #$rcs = ' $Id: tools.pl,v 1.18 2004/11/12 20:25:00 daniel Exp $ ' ; # Copyright (C) 1999-2002 Giorgos Zervas and # Daniel Naber # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or (at # your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 # USA my $http_max_pages_counter = 0; # Remove some HTML special characters from a string. This is necessary # to avoid cross site scripting attacks. # See http://www.cert.org/advisories/CA-2000-02.html sub cleanup { my $str = $_[0]; if( ! defined($str) ) { return ""; } $str =~ s/[<>"'&]/ /igs; return $str; } # Escape some HTML special characters in a string. This is necessary # to avoid cross site scripting attacks. # See http://www.cert.org/advisories/CA-2000-02.html sub html_escape { my $str = $_[0]; if( ! defined($str) ) { return ""; } $str =~ s/&/&/igs; $str =~ s//>/igs; $str =~ s/"/"/igs; $str =~ s/'/'/igs; return $str; } sub init_config { if( $DOCUMENT_ROOT !~ m#/$# ) { # add ending slash if necessary $DOCUMENT_ROOT = $DOCUMENT_ROOT."/"; } $BASE_URL =~ s/\/$//; # remove trailing slash } sub init_robot_check { my $base = shift; if( $ROBOT_AGENT && $HTTP_START_URL ) { eval "use WWW::RobotRules"; if( $@ ) { die("Cannot use robots.txt, maybe WWW::RobotRules is not installed? $!"); } $main::robot = WWW::RobotRules->new($ROBOT_AGENT); my $url = "$base/robots.txt"; print "Loading $url...\n"; my $http_user_agent = LWP::UserAgent->new; my $robots_txt; (undef, undef, undef, $robots_txt) = get_url($http_user_agent, $url); if( $robots_txt ) { $main::robot->parse($url, $robots_txt); } else { warn("Not using any robots.txt.\n"); } } } sub isHTML { my $filename = shift; my $ext = get_suffix($filename); if( $ext ) { return grep(/^\Q$ext\E$/i, @HIGHLIGHT_EXT); } else { return 0; } } sub get_suffix { my $filename = shift; if( $filename =~ m/\?/ ) { $filename =~ s/\?.*$//; } my ($suffix) = ($filename =~ m/\.([^.]*)$/); return $suffix; } sub filterFile { my $filename = shift; my $ext = shift; my $buffer; my @args = split(/\s+/, $EXT_FILTER{$ext}); # don't allow any filename for security reasons: if( $filename !~ m/^[\\\/a-zA-Z0-9_.: +-]*$/ || $filename =~ m/\.\./ ) { print STDERR "Ignoring '$filename': illegal characters in filename\n"; return ""; } foreach (@args) { if( $_ eq 'FILENAME' ) { $_ =~ s/FILENAME/"$filename"/g; } } my $command = join(' ', @args); open(CMD, "$command|") || warn("Cannot execute '$command': $!") && return ""; while( ) { $buffer .= $_; } close(CMD); return $buffer; } sub debug { my $str = shift; if( $HTTP_DEBUG && $ENV{'REQUEST_METHOD'} ) { print $str; } elsif( $HTTP_DEBUG && ! $ENV{'REQUEST_METHOD'} ) { print STDERR $str; } } sub error { my $str = shift; if( $ENV{'REQUEST_METHOD'} ) { print $str; } else { print STDERR $str; } } # Fetch URL via http, return real URL (differs only in case of redirect) and # document's contents. Return nothing in case of error or unwanted Content-Type sub get_url { my $http_user_agent = shift; my $url = shift; my $search_mode = shift; # $search_mode = don't show debugging # Avoid endless loops: if( $http_max_pages_counter >= $HTTP_MAX_PAGES ) { error("Error: Ignoring '$url': \$HTTP_MAX_PAGES=$HTTP_MAX_PAGES limit reached.\n"); return; } $http_max_pages_counter++; my $request = HTTP::Request->new(GET => $url); my $response = $http_user_agent->request($request); if( $response->is_error ) { error("Error: Couldn't get '$url': response code " .$response->code. "\n"); return; } if( $response->headers_as_string =~ m/^Content-Type:\s*(.+)$/im ) { my $content_type = $1; $content_type =~ s/^(.*?);.*$/$1/; # ignore possible charset value if( ! grep(/^$content_type$/i, @HTTP_CONTENT_TYPES) ) { debug("Ignoring '$url': content-type '$content_type'\n"); return; } } my $buffer = $response->content; my $size = length($buffer); debug("Fetched '$url', $size bytes\n") if( ! $search_mode ); # Maybe we are we redirected, so use the new URL. # Note: this also uses , so href="..." has to point # to the page itself, not to the directory (even though the latter # will work okay in browsers): $url = $response->base; my $date_last_modified = $response->last_modified; return ($url, $date_last_modified, $size, $buffer); } ## Are there meta tags that forbid visiting this page / ## following its URLs? Returns "", "none", "noindex" or "nofollow" sub robot_meta_tag { my $content = shift; my $meta_tags = ""; while( ${$content} =~ m//igs ) { my $tag = $1; if( $tag =~ m/name\s*=\s*"robots"/is ) { my ($value) = ($tag =~ m/content\s*=\s*"(.*?)"/igs); if( $value =~ m/none/is ) { $meta_tags = "none"; } elsif( $value =~ m/noindex/is && $value =~ m/nofollow/is ) { $meta_tags = "none"; } elsif( $value =~ m/noindex/is ) { $meta_tags = "noindex"; } elsif( $value =~ m/nofollow/is ) { $meta_tags = "nofollow"; } } } return $meta_tags; } # Load the user's list of (common) words that should not be indexed. # Use a hash so lookup is faster. Well-chosen stopwords can make # indexing faster. sub load_stopwords { my %stopwords; open(FILE, $STOPWORDS_FILE) or (warn "Cannot open '$STOPWORDS_FILE': $!" and return); while () { chomp; $_ =~ s/\r//g; # get rid of carriage returns $stopwords{$_} = 1; } close(FILE); return %stopwords; } my $special_chars; # special characters we have to replace # Build list of special characters that will be replaced in normalize(), # put this list in global variable $special_chars. sub build_char_string { foreach my $number (keys %entities) { $special_chars .= chr($number); } } # Represent all special characters as the character they are based on. sub remove_accents { my $buffer = $_[0]; # Special cases: $buffer =~ s/þ/th/igs; $buffer =~ s/ð/d/igs; $buffer =~ s/ß/ss/igs; # Now represent special characters as the characters they are based on: $buffer =~ s/&(..?)(grave|acute|circ|tilde|uml|ring|cedil|slash|lig);/$1/igs; return $buffer; } # Represent all special characters as HTML entities like &; sub normalize_special_chars { my $buffer = $_[0]; # There may be special characters that are not encoded, so encode them: $buffer =~ s/([$special_chars])/"&#".ord($1).";"/gse; # Special characters can be encoded using hex values: $buffer =~ s/&#x([\dA-F]{2});/"&#".hex("0x".$1).";"/igse; # Special characters may be encoded with numbers, undo that (use the if() to avoid warnings): $buffer =~ s/&#(\d\d\d);/if( $1 >= 192 && $1 <= 255 ) { "&$entities{$1};"; }/gse; return $buffer; } %entities = ( 192 => 'Agrave', # capital A, grave accent 193 => 'Aacute', # capital A, acute accent 194 => 'Acirc', # capital A, circumflex accent 195 => 'Atilde', # capital A, tilde 196 => 'Auml', # capital A, dieresis or umlaut mark 197 => 'Aring', # capital A, ring 198 => 'AElig', # capital AE diphthong (ligature) 199 => 'Ccedil', # capital C, cedilla 200 => 'Egrave', # capital E, grave accent 201 => 'Eacute', # capital E, acute accent 202 => 'Ecirc', # capital E, circumflex accent 203 => 'Euml', # capital E, dieresis or umlaut mark 205 => 'Igrave', # capital I, grave accent 204 => 'Iacute', # capital I, acute accent 206 => 'Icirc', # capital I, circumflex accent 207 => 'Iuml', # capital I, dieresis or umlaut mark 208 => 'ETH', # capital Eth, Icelandic (Dstrok) 209 => 'Ntilde', # capital N, tilde 210 => 'Ograve', # capital O, grave accent 211 => 'Oacute', # capital O, acute accent 212 => 'Ocirc', # capital O, circumflex accent 213 => 'Otilde', # capital O, tilde 214 => 'Ouml', # capital O, dieresis or umlaut mark 216 => 'Oslash', # capital O, slash 217 => 'Ugrave', # capital U, grave accent 218 => 'Uacute', # capital U, acute accent 219 => 'Ucirc', # capital U, circumflex accent 220 => 'Uuml', # capital U, dieresis or umlaut mark 221 => 'Yacute', # capital Y, acute accent 222 => 'THORN', # capital THORN, Icelandic 223 => 'szlig', # small sharp s, German (sz ligature) 224 => 'agrave', # small a, grave accent 225 => 'aacute', # small a, acute accent 226 => 'acirc', # small a, circumflex accent 227 => 'atilde', # small a, tilde 228 => 'auml', # small a, dieresis or umlaut mark 229 => 'aring', # small a, ring 230 => 'aelig', # small ae diphthong (ligature) 231 => 'ccedil', # small c, cedilla 232 => 'egrave', # small e, grave accent 233 => 'eacute', # small e, acute accent 234 => 'ecirc', # small e, circumflex accent 235 => 'euml', # small e, dieresis or umlaut mark 236 => 'igrave', # small i, grave accent 237 => 'iacute', # small i, acute accent 238 => 'icirc', # small i, circumflex accent 239 => 'iuml', # small i, dieresis or umlaut mark 240 => 'eth', # small eth, Icelandic 241 => 'ntilde', # small n, tilde 242 => 'ograve', # small o, grave accent 243 => 'oacute', # small o, acute accent 244 => 'ocirc', # small o, circumflex accent 245 => 'otilde', # small o, tilde 246 => 'ouml', # small o, dieresis or umlaut mark 248 => 'oslash', # small o, slash 249 => 'ugrave', # small u, grave accent 250 => 'uacute', # small u, acute accent 251 => 'ucirc', # small u, circumflex accent 252 => 'uuml', # small u, dieresis or umlaut mark 253 => 'yacute', # small y, acute accent 254 => 'thorn', # small thorn, Icelandic 255 => 'yuml', # small y, dieresis or umlaut mark ); 1;