#!/usr/bin/perl -w
#
# Build a simple full-text search index for the HTML and text files found
# under $WEB.  The index is a DBM hash ("search_index.db") holding two kinds
# of records:
#
#   "-N"   => title of document number N
#   "word" => "-N1-N2..." list of the document numbers containing that word
#
use strict;
use warnings;
use File::Find;
use DB_File;

$| = 1;    # unbuffered STDOUT so progress lines appear immediately

my $WEB  = "/var/www/html/";
my $HOST = "http://www.easyya.com/";
# The leading $WEB of each path is replaced by $HOST to form the URL, so the
# two values must agree on whether they end with a trailing slash.
###############################################################

my $INDEX_FILE = "search_index.db";

my %db;

# Document numbers start at 1: with 0 the record key -$fileno evaluates to
# "0", which collides with the index entry for the word "0".
my $fileno = 1;

# Always rebuild the index from scratch.
if (-e $INDEX_FILE) {
    unlink($INDEX_FILE) or die "unlink $INDEX_FILE: $!";
}
dbmopen(%db, $INDEX_FILE, 0644) or die "dbmopen: $!";

finddepth(\&wanted, $WEB);

dbmclose(%db);

# File::Find callback: index one file.
#
# Called with $_ set to the file's base name and $File::Find::name set to its
# full path.  Symbolic links and files not ending in .html or .txt are
# skipped.  For every indexed file the title is stored under "-N" and N is
# appended to the posting list of each distinct word in the file.
sub wanted {
    my $basename = $_;                   # capture before anything can clobber $_
    my $filename = $File::Find::name;

    return if -l $filename;
    return unless $basename =~ /\.(?:html|txt)$/;

    print "indexing No.$fileno $filename\n";

    my $parser = MyParser->new;
    unless (defined $parser->parse_file($filename)) {
        # parse_file returns undef when the file cannot be opened.
        warn "skipping $filename: $!\n";
        return;
    }

    # Fall back to the file name when the document has no usable title
    # (missing, empty, or whitespace only).
    my $title = $parser->{title};
    $title = $basename if !defined $title || $title !~ /\S/;

    # Map the filesystem path to its public URL.  \Q...\E keeps regex
    # metacharacters in $WEB literal; \A restricts the match to the prefix.
    (my $url = $filename) =~ s/\A\Q$WEB\E/$HOST/;

    # NOTE(review): $url is computed but not stored.  The record format the
    # search front end expects is not visible from this file, so the stored
    # value is left as the title only -- confirm against the reader and add
    # the link if it is meant to be there.
    $db{-$fileno} = "$title";

    # Record this document once under each distinct word (case-sensitive).
    my $text = defined $parser->{TEXT} ? $parser->{TEXT} : '';
    my %seen;
    foreach my $word (grep { !$seen{$_}++ } $text =~ /\w+/g) {
        $db{$word} .= "-$fileno";
    }

    $fileno++;
    return;
}

# Minimal HTML::Parser subclass that collects the document title and all
# text content.  State is kept in the parser object's hash:
#   {TITLE} - true while inside a <title> element
#   {title} - accumulated title text
#   {TEXT}  - accumulated text of the whole document
BEGIN {
    package MyParser;
    require HTML::Parser;
    @MyParser::ISA = qw(HTML::Parser);

    # Start tag: note when the <title> element opens.
    sub start {
        my ($self, $tag) = @_;
        $self->{TITLE} = "defined" if $tag eq 'title';
    }

    # End tag: note when the <title> element closes.
    sub end {
        my ($self, $tag) = @_;
        undef $self->{TITLE} if $tag eq 'title' && $self->{TITLE};
    }

    # Text segment: append to the title (when inside <title>) and to the
    # document text.
    sub text {
        my ($self, $text) = @_;

        # The parser may deliver one run of text in several pieces, so
        # append rather than overwrite.
        $self->{title} .= $text if $self->{TITLE};

        # Separate segments with a space so that words on either side of a
        # tag (e.g. "</p><p>") are not glued together.  Per the HTML::Parser
        # documentation, text is only broken at whitespace boundaries, so
        # this does not split words at chunk boundaries; a word interrupted
        # by inline markup is indexed as separate pieces.
        $self->{TEXT} .= "$text ";
    }
}

1;
__END__