#!/usr/local/bin/perl5
# ice2-idx.pl - create index file
#
# ICE Version 1.4 beta
# February 1997
# (C) Christian Neuss (neuss@hotmail.com)
#
#
# Modified for Internet Marketing 1
# Documentation and path modification only, for ease of use in installing
# on IM1 Servers. The only code modified was the addition of some variables at
# the beginning for easy configuring on im1 servers only. Our thanks to
# Christian Neuss for this fabulous script, which we offer and support
# for free.
#
# Overview: the script walks @SEARCHDIRS, reads every .rtf/.html/.htm/
# .shtml/.txt file, counts the words found outside of HTML tags (plus the
# CONTENT of META tags) and writes $INDEXFILE in this layout:
#
#   <word> <freq><fileid> <freq><fileid> ...   one line per word
#   --                                         separator
#   <directory>                                whenever the directory changes
#   <fileid> <name> /<mtime> <title>           one line per indexed file
#
# A word whose entry is the single character "0" was dropped because it
# occurs in more than $MAXPERCENT percent of the files.

use strict;
use warnings;
use File::Find;

#
# Change USERID to your user ID
#
our $userid = "USERID";

# END OF BASIC CONFIGURATION
#
# ADVANCED USERS:
#
# To exclude directories from the search, put the full server paths of the
# subdirectories. NOTE: once you exclude a directory, all of *its*
# subdirectories are excluded. Just change the SUBDIR to your subdirectory
# and leave the rest alone.
#
# You must UNCOMMENT (remove the '#' from each line) for this to work.
our @excludedirs = (
    "/usr/local/www/data/$userid/cgi-bin",
#   "/usr/local/www/data/$userid/SUBDIR",
#   "/usr/local/www/data/$userid/SUBDIR",
#   "/usr/local/www/data/$userid/SUBDIR",
);

## 14MB data gives an index size 1.1MB at MAXPERCENT = 40
## and 1.2MB at MAXPERCENT = 60

## to do: apply iso2html _after_ the complete index was computed
##        allow pre-defining a list of stopwords (simple)

# Have this script called up on a regular basis via 'cron'.
# If that's not possible, re-index manually whenever changes
# to your document hierarchy have been made.
# Make sure that if, for security reasons, it is being executed
# with a user id other than root, this user has
# both read access to the html files and write access to
# the index file.
# Here's an example of a crontab entry (crontab syntax varies
# between different platforms):
# 1 20 * * * neuss /usr/httpd/ice-idx.pl >/dev/console 2>&1

#--- start of configuration --- put your changes here ---

# NOTE: Depending on your Perl implementation, you may
# have to use different path separators in the following
# paths when you are on a Macintosh or PC system. In that
# case, a path may look like e.g. "usr:foo:bar" (Mac), or
# "\\usr\\foo\\bar" resp. '\usr\foo\bar' (PC).

# The physical directory/directories to scan for html-files.
# It's better to supply a trailing "/" for each directory,
# since otherwise automounting may not work.
# Example:
#   @SEARCHDIRS=('/usr/www/dir','/tmp/html','/usr/foo/html-dir');
our @SEARCHDIRS = (
    "/usr/local/www/data/$userid/",
);

# Location of the index file.
# Example:
#   $INDEXFILE='/usr/local/httpd/index.idx';
our $INDEXFILE = "/usr/local/www/data/$userid/cgi-bin/index.idx";

# The ICE indexer will support full international characters by
# converting them to their html equivalent if $ISO is set.
# This has a slightly negative impact on the indexing speed, so
# set it to "y" only if you index files with 8 bit international
# characters. OTHERWISE DON'T! iso2html seems to cause a memory
# leak, causing the indexer to run forever. I'm working on it.
our $ISO = "n";

# Type of system (for figuring out the path delimiting character)
# that ice-idx.pl runs on. Select one of "UNIX", "MAC", or "PC"
our $TYPE = "UNIX";

# Minimum length of word to be indexed
our $MINLEN = 3;

# Stop indexing a word that appears in over X percent of all files
our $MAXPERCENT = 60;

#--- end of configuration --- don't change anything below ---

# Indexer state. These are declared before the main flow because the
# subroutines at the bottom of the file share them.
my @allfiles;       # candidate files collected by wanted()
my %index;          # word   => "<freq><fileid> ..." or "0" (too common)
my %files;          # fileid => path of the indexed file
my %titles;         # fileid => text collected between <title> and </title>
my %mtimes;         # fileid => modification time as returned by stat
my $fileno = 0;     # number of files indexed so far; source of the file ids
my @isohtml;        # entity table for 0xC0..0xFF, built lazily by iso2html()

open(my $index_fh, '>', $INDEXFILE)
    or die("Cannot open $INDEXFILE: $!\n");

# File::Find replaces the old "find.pl" library, which is no longer
# shipped with Perl.
find(\&wanted, @SEARCHDIRS);

my $count   = 0;
my $percent = 0;
foreach my $name (@allfiles) {
    if (is_excluded($name)) {
        print STDERR "excluding [$name]\n";
        next;
    }

    # print STDERR "indexing [$name]\n";
    my $lastpercent = $percent;
    $percent = int(100 * $count / scalar(@allfiles));
    if ($percent > $lastpercent) {
        print STDERR $percent, "% ";
    }
    indexfile($name);
    $count++;

    # every 100th file until the 1000th...
    if ((($count % 100) == 0) && ($count < 1000)) {
        # remove the most frequent words so far from the index
        removefrequent();
    }
}

removefrequent();

# print sorted list of words and their fileids
foreach my $word (sort keys(%index)) {
    print {$index_fh} "$word ", $index{$word}, "\n";
}
print {$index_fh} "--\n";

# print list of all files and their fileid; a directory line is written
# whenever the directory differs from that of the previous file
my $dir = "";
foreach my $id (sort keys(%files)) {
    next unless $files{$id} =~ m{(.*)/([^/]*)\z};
    my ($filedir, $name) = ($1, $2);
    if ($filedir ne $dir) {
        print {$index_fh} "$filedir\n";
    }
    $dir = $filedir;
    print {$index_fh} "$id $name /$mtimes{$id} $titles{$id}\n";
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close($index_fh) or die("Cannot close $INDEXFILE: $!\n");

###system("ps -vx | egrep 'perl|MEM'");

# is_excluded - true if $path is one of @excludedirs or lies below one.
#
# The directory is compared literally (it is a path, not a regex) and
# only on a path-separator boundary, so excluding ".../cgi-bin" does not
# also exclude a sibling such as ".../cgi-binfoo.html".
sub is_excluded {
    my ($path) = @_;
    foreach my $excluded (@excludedirs) {
        (my $prefix = $excluded) =~ s{[/\\:]+\z}{};
        next unless length($prefix);
        return 1 if $path =~ m{^\Q$prefix\E(?:[/\\:]|\z)};
    }
    return 0;
}

# wanted - File::Find callback ($_ is the basename, $File::Find::name the
# full path). Collects plain files whose name ends in .rtf, .html, .htm,
# .shtml or .txt.
sub wanted {
    return unless /\.(?:rtf|html|htm|shtml|txt)\z/;
    return unless -f $_;
    push(@allfiles, $File::Find::name);
}

# removefrequent - drop words that occur in too many files.
#
# modifies %index: the entry of every word that appears in more than
# $MAXPERCENT percent of the files indexed so far is replaced by "0",
# which also keeps indexfile() from adding to it again.
sub removefrequent {
    my $numfiles = keys(%files);
    foreach my $word (keys(%index)) {
        # every file id ends in a blank, so the number of blanks in an
        # entry is the number of files the word occurs in
        my $num = ($index{$word} =~ tr/ //);

        # don't index words in more than X % of the files
        if ($num * 100 > $MAXPERCENT * $numfiles) {
            print STDERR ("removing common word: $word [$num of $numfiles]\n");
            $index{$word} = "0";
        }
    }
}

# indexfile - read one file and merge its words into the index.
#
# $file is the path of the file. On success the file gets the next hex
# file id and %files, %titles, %mtimes and %index are updated. A file
# that cannot be opened is reported on STDERR and skipped without
# consuming a file id.
sub indexfile {
    my ($file) = @_;
    my $title   = "";
    my $intitle = "";
    my %freqlist;       # word => number of occurrences in this file

    my $fh;
    unless (open($fh, '<', $file)) {    # file readable?
        print STDERR "cannot read file [$file]\n"; ### XXX no printo
        return;
    }
    $fileno++;
    my $fileid = sprintf("%X ", $fileno);

    my $mtime = (stat($fh))[9];
    $mtime = "" unless defined($mtime);

    # set input separator to the tag close character ">", so that every
    # record read below ends with at most one complete tag
    local $/ = ">";
    while (my $chunk = <$fh>) {
        $chunk =~ s/\s+/ /g;            # fold whitespaces into a single blank
        $chunk =~ s/([^\n])</$1\n</g;   # insert a linebreak before every '<'
        $chunk =~ s/>([^\n])/>\n$1/g;   # .. and after every '>'
        foreach my $piece (split(/\n/, $chunk)) {
            # opening title tag
            if ($piece =~ m:<title>:i) {
                $intitle = "y";
                $title   = "";
            }
            # closing title tag
            if ($piece =~ m:</title>:i) {
                $intitle = "";
            }

            # outside a tag or inside META tag => index word
            next unless ($piece !~ /</ || $piece =~ /<META/i);

            if ($piece =~ /<META.*CONTENT="([^"]*)">/i) {
                $piece = $1;
                ## print "FOUND META TAG $piece\n";
            }
            if ($ISO eq "y") {  # convert iso 8bit to html
                $piece = iso2html($piece);
            }

            # if inside title
            if ($intitle) {
                $piece =~ tr/A-Za-z&;0-9\-/ /cs;
                $title .= $piece;
                next;
            }

            # everything but letters, '&' and ';' separates words
            $piece =~ tr/A-Za-z&;/ /cs;
            foreach my $word (split(/ /, $piece)) {
                next unless (length($word) >= $MINLEN);   # if too short skip
                if ($word =~ /\;$/) {
                    # get rid of trailing ";" that aren't part of &Xuml;
                    $word =~ s/((\w|\&[a-z,A-Z]+\;)+)\;?/$1/;
                }
                if ($word =~ /^[A-Z][^A-Z]*$/) {
                    # "Someword" to "someword"
                    $word =~ tr/A-Z/a-z/;
                }
                $freqlist{$word}++;
            }
        }
    }
    close($fh);

    $file =~ tr/\n/ /s;     # the index is line oriented

    # convert MAC and PC path separators to UNIX style slashes
    if ($TYPE eq "MAC") { $file =~ s|:|/|g; }
    if ($TYPE eq "PC")  { $file =~ s|\\|/|g; }
    # on a MAC (or PC), add the leading slash; UNIX paths are stored as
    # they were found
    if ($TYPE ne "UNIX" && $file =~ m/^[^\/]/) {
        $file = "/$file";
    }
    # Record the path only now: storing it before the conversion above
    # would leave native separators in the index, and the file list is
    # split on "/" when the index is written.
    $files{$fileid} = $file;

    $title =~ tr/\n/ /s;

    foreach my $word (keys(%freqlist)) {
        # "0" marks a word that removefrequent() has dropped
        next if (defined($index{$word}) && $index{$word} eq "0");

        # a one digit count is followed directly by the file id, longer
        # counts are separated from it by ":"
        my $freq = $freqlist{$word};
        $freq .= ":" unless length($freq) == 1;
        $index{$word} .= $freq . $fileid;
    }
    $titles{$fileid} = $title;
    $mtimes{$fileid} = $mtime;
    return;
}

# iso2html - translate iso 8 bit characters to HTML
#
# Returns a copy of $input in which every ISO 8859-1 character in the
# range 0xC0..0xFF is replaced by its HTML character entity; all other
# characters are passed through unchanged.
#
# Thanks to
# Pierre Cormier (cormier.pierre@uqam.ca)
# Universite du Quebec Montreal
sub iso2html {
    my ($input) = @_;
    return "" unless defined($input);

    unless (@isohtml) {
        # entity names for 0xC0 .. 0xFF, eight per row
        @isohtml = map { "&$_;" } qw(
            Agrave Aacute Acirc  Atilde Auml   Aring  AElig  Ccedil
            Egrave Eacute Ecirc  Euml   Igrave Iacute Icirc  Iuml
            ETH    Ntilde Ograve Oacute Ocirc  Otilde Ouml   times
            Oslash Ugrave Uacute Ucirc  Uuml   Yacute THORN  szlig
            agrave aacute acirc  atilde auml   aring  aelig  ccedil
            egrave eacute ecirc  euml   igrave iacute icirc  iuml
            eth    ntilde ograve oacute ocirc  otilde ouml   divide
            oslash ugrave uacute ucirc  uuml   yacute thorn  yuml
        );
    }

    $input =~ s/([\xC0-\xFF])/$isohtml[ord($1) - 0xC0]/g;
    return $input;
}