[Koha-cvs] koha C4/Biblio.pm C4/Search.pm updater/updateda...

From: paul poulain
Subject: [Koha-cvs] koha C4/Biblio.pm C4/Search.pm updater/updateda...
Date: Wed, 02 May 2007 16:44:31 +0000

CVSROOT:        /sources/koha
Module name:    koha
Changes by:     paul poulain <tipaul>   07/05/02 16:44:31

Modified files:
        C4             : Biblio.pm Search.pm 
        updater        : updatedatabase 
        misc/migration_tools: rebuild_nozebra.pl 

Log message:
        NoZebra SQL index management : 
        * adding 3 subs in Biblio.pm
        - GetNoZebraIndexes, which gets the index structure from a new 
system preference (added with this commit)
        - _DelBiblioNoZebra, which retrieves all index entries for a biblio and 
removes the biblio reference from them in a variable
        - _AddBiblioNoZebra, which adds index entries for a biblio.
        Note that the _Add and _Del subs work only on a hash variable, to 
speed things up in case of a modification (i.e. delete + add). The actual SQL update 
is done in the ModZebra sub (which already existed and previously dealt only with the zebra index).
        I think the code has to be more deeply tested, but it works at least 


Index: C4/Biblio.pm
RCS file: /sources/koha/koha/C4/Biblio.pm,v
retrieving revision 1.201
retrieving revision 1.202
diff -u -b -r1.201 -r1.202
--- C4/Biblio.pm        27 Apr 2007 14:00:49 -0000      1.201
+++ C4/Biblio.pm        2 May 2007 16:44:31 -0000       1.202
@@ -33,7 +33,7 @@
 use vars qw($VERSION @ISA @EXPORT);
 # set the version for version checking
-$VERSION = do { my @v = '$Revision: 1.201 $' =~ /\d+/g; shift(@v).".".join( 
"_", map { sprintf "%03d", $_ } @v ); };
+$VERSION = do { my @v = '$Revision: 1.202 $' =~ /\d+/g; shift(@v).".".join( 
"_", map { sprintf "%03d", $_ } @v ); };
 @ISA = qw( Exporter );
@@ -113,6 +113,7 @@
+  &GetNoZebraIndexes
 =head1 NAME
@@ -607,7 +608,11 @@
     return $error if $error;
-    # Delete in Zebra
+    # Delete in Zebra. Be careful NOT to move this line after 
+    # for at least 2 reasons :
+    # - we need to read the biblio if NoZebra is set (to remove it from the 
+    # - if something goes wrong, the biblio may be deleted from Koha but not 
from zebra
+    #   and we would have no way to remove it (except manually in zebra, but I 
bet it would be very hard to handle the problem)
     # delete biblio from Koha tables and save in deletedbiblio
@@ -2705,97 +2710,254 @@
 # replaced by a zebraqueue table, that is filled with ModZebra to run.
 # the table is emptied by misc/cronjobs/zebraqueue_start.pl script
-my $sth=$dbh->prepare("insert into zebraqueue  (biblio_auth_number 
,server,operation) values(?,?,?)");
-#     my @Zconnbiblio;
-#     my $tried     = 0;
-#     my $recon     = 0;
-#     my $reconnect = 0;
-#     my $record;
-#     my $shadow;
-#   reconnect:
-#     $Zconnbiblio[0] = C4::Context->Zconn( $server, 0, 1 );
-#     if ( $server eq "biblioserver" ) {
-#         # it's unclear to me whether this should be in xml or MARC format
-#         # but it is clear it should be nabbed from zebra rather than from
-#         # the koha tables
-#         $record = GetMarcBiblio($biblionumber);
-#         $record = $record->as_xml_record() if $record;
-# #            warn "RECORD $biblionumber => ".$record;
-#         $shadow="biblioservershadow";
-#         #           warn "RECORD $biblionumber => ".$record;
-#         $shadow = "biblioservershadow";
-#     }
-#     elsif ( $server eq "authorityserver" ) {
-#         $record = C4::AuthoritiesMarc::XMLgetauthority( $dbh, $biblionumber 
-#         $shadow = "authorityservershadow";
-#     }    ## Add other servers as necessary
-#     my $Zpackage = $Zconnbiblio[0]->package();
-#     $Zpackage->option( action => $op );
-#     $Zpackage->option( record => $record );
-#   retry:
-#     $Zpackage->send("update");
-#     my $i;
-#     my $event;
-#     while ( ( $i = ZOOM::event( address@hidden ) ) != 0 ) {
-#         $event = $Zconnbiblio[0]->last_event();
-#         last if $event == ZOOM::Event::ZEND;
-#     }
-#     my ( $error, $errmsg, $addinfo, $diagset ) = $Zconnbiblio[0]->error_x();
-#     if ( $error == 10000 && $reconnect == 0 )
-#     {    ## This is serious ZEBRA server is not available -reconnect
-#         warn "problem with zebra server connection";
-#         $reconnect = 1;
-#         my $res = system('sc start "Z39.50 Server" 
-#         #warn "Trying to restart ZEBRA Server";
-#         #goto "reconnect";
-#     }
-#     elsif ( $error == 10007 && $tried < 2 )
-#     {    ## timeout --another 30 looonng seconds for this update
-#         $tried = $tried + 1;
-#         warn "warn: timeout, trying again";
-#         goto "retry";
-#     }
-#     elsif ( $error == 10004 && $recon == 0 ) {    ##Lost connection 
-#         $recon = 1;
-#         warn "error: reconnecting to zebra";
-#         goto "reconnect";
-#    # as a last resort, we save the data to the filesystem to be indexed in 
-#     }
-#     elsif ($error) {
-#         warn
-# "Error-$server   $op $biblionumber /errcode:, $error, /MSG:,$errmsg,$addinfo 
-#         $Zpackage->destroy();
-#         $Zconnbiblio[0]->destroy();
-#         ModZebrafiles( $dbh, $biblionumber, $record, $op, $server );
-#         return;
-#     }
-#     if ( C4::Context->$shadow ) {
-#         $Zpackage->send('commit');
-#         while ( ( $i = ZOOM::event( address@hidden ) ) != 0 ) {
-#             #waiting zebra to finish;
-#          }
-#     }
-#     $Zpackage->destroy();
+    if (C4::Context->preference("NoZebra")) {
+        # lock the nozebra table : we will read index lines, update them in 
Perl process
+        # and write everything in 1 transaction.
+        # lock the table to avoid someone else overwriting what we are doing
+        $dbh->do('LOCK TABLES nozebra WRITE,biblio WRITE,biblioitems WRITE, 
systempreferences WRITE');
+        my %result; # the result hash that will be builded by deletion / add, 
and written on mySQL at the end, to improve speed
+        my $record= GetMarcBiblio($biblionumber);
+        if ($op eq 'specialUpdate') {
+            # OK, we have to add or update the record
+            # 1st delete (virtually, in indexes) ...
+            %result = _DelBiblioNoZebra($biblionumber,$record);
+            # ... add the record
+            %result=_AddBiblioNoZebra($biblionumber,$record, %result);
+        } else {
+            # it's a deletion, delete the record...
+            %result=_DelBiblioNoZebra($biblionumber,$record);
+        }
+        # ok, now update the database...
+        my $sth = $dbh->prepare("UPDATE nozebra SET biblionumbers=? WHERE 
indexname=? AND value=?");
+        foreach my $key (keys %result) {
+            foreach my $index (keys %{$result{$key}}) {
+                $sth->execute($result{$key}->{$index},$key,$index);
+            }
+        }
+    $dbh->do('UNLOCK TABLES');
+    } else {
+    #
+    # we use zebra, just fill zebraqueue table
+    #
+    my $sth=$dbh->prepare("insert into zebraqueue  (biblio_auth_number 
,server,operation) values(?,?,?)");
+    $sth->execute($biblionumber,$server,$op);
+    $sth->finish;
+    }
+=head2 GetNoZebraIndexes
+sub GetNoZebraIndexes {
+    my $index = C4::Context->preference('NoZebraIndexes');
+    my %indexes;
+    foreach my $line (split /('|"),/,$index) {
+        $line =~ /(.*)=>(.*)/;
+        my $index = substr($1,1); # get the index, don't forget to remove 
initial ' or "
+        my $fields = $2;
+        $index =~ s/'|"| //g;
+        $fields =~ s/'|"| //g;
+        $indexes{$index}=$fields;
+    }
+    return %indexes;
+=head2 _DelBiblioNoZebra($biblionumber,$record);
+    function to delete a biblio in NoZebra indexes
+    This function does NOT delete anything in database : it reads all the 
indexes entries
+    that have to be deleted & delete them in the hash
+    The SQL part is done either :
+    - after the Add if we are modifying a biblio (delete + add again)
+    - immediatly after this sub if we are doing a true deletion.
+sub _DelBiblioNoZebra {
+    my ($biblionumber,$record)=@_;
+    # Get the indexes
+    my $dbh = C4::Context->dbh;
+    # Get the indexes
+    my %index=GetNoZebraIndexes;
+    # get title of the record (to store the 10 first letters with the index)
+    my ($titletag,$titlesubfield) = GetMarcFromKohaField('biblio.title');
+    my $title = lc($record->subfield($titletag,$titlesubfield));
+    my %result;
+    # remove blancks comma (that could cause problem when decoding the string 
for CQL retrieval) and regexp specific values
+    $title =~ s/ |,|;|\[|\]|\(|\)|\*|-|'|=//g;
+    # limit to 10 char, should be enough, and limit the DB size
+    $title = substr($title,0,10);
+    #parse each field
+    my $sth2=$dbh->prepare('SELECT biblionumbers FROM nozebra WHERE 
indexname=? AND value=?');
+    foreach my $field ($record->fields()) {
+        #parse each subfield
+        next if $field->tag <10;
+        foreach my $subfield ($field->subfields()) {
+            my $tag = $field->tag();
+            my $subfieldcode = $subfield->[0];
+            my $indexed=0;
+            # check each index to see if the subfield is stored somewhere
+            # otherwise, store it in __RAW__ index
+            foreach my $key (keys %index) {
+#                 warn "examining $key index : ".$index{$key}." for $tag 
+                if ($index{$key} =~ /$tag\*/ or $index{$key} =~ 
/$tag$subfieldcode/) {
+                    $indexed=1;
+                    my $line= lc $subfield->[1];
+                    # remove meaningless value in the field...
+                    $line =~ 
s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
+                    # ... and split in words
+                    foreach (split / /,$line) {
+                        next unless $_; # skip  empty values (multiple spaces)
+                        # if the entry is already here, do nothing, the 
biblionumber has already be removed
+                        unless ($result{$key}->{$_} =~ 
/$biblionumber,$title\-(\d);/) {
+                            # get the index value if it exist in the nozebra 
table and remove the entry, otherwise, do nothing
+                            $sth2->execute($key,$_);
+                            my $existing_biblionumbers = $sth2->fetchrow;
+                            # it exists
+                            if ($existing_biblionumbers) {
+                                warn " existing for $key $_: 
+                                $result{$key}->{$_} =$existing_biblionumbers;
+                                $result{$key}->{$_} =~ 
+                            }
+                        }
+                    }
+                }
+            }
+            # the subfield is not indexed, store it in __RAW__ index anyway
+            unless ($indexed) {
+                my $line= lc $subfield->[1];
+                $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ 
+                # ... and split in words
+                foreach (split / /,$line) {
+                    next unless $_; # skip  empty values (multiple spaces)
+                    # if the entry is already here, do nothing, the 
biblionumber has already be removed
+                    unless ($result{'__RAW__'}->{$_} =~ 
/$biblionumber,$title\-(\d);/) {
+                        # get the index value if it exist in the nozebra table 
and remove the entry, otherwise, do nothing
+                        $sth2->execute('__RAW__',$_);
+                        my $existing_biblionumbers = $sth2->fetchrow;
+                        # it exists
+                        if ($existing_biblionumbers) {
+                            warn " existing for __RAW__ $_ : 
+                            $result{'__RAW__'}->{$_} =$existing_biblionumbers;
+                            $result{'__RAW__'}->{$_} =~ 
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return %result;
+=head2 _AddBiblioNoZebra($biblionumber,$record,%result);
+    function to add a biblio in NoZebra indexes
+sub _AddBiblioNoZebra {
+    my ($biblionumber,$record,%result)=@_;
+    my $dbh = C4::Context->dbh;
+    # Get the indexes
+    my %index=GetNoZebraIndexes;
+    # get title of the record (to store the 10 first letters with the index)
+    my ($titletag,$titlesubfield) = GetMarcFromKohaField('biblio.title');
+    my $title = lc($record->subfield($titletag,$titlesubfield));
+    # remove blancks comma (that could cause problem when decoding the string 
for CQL retrieval) and regexp specific values
+    $title =~ s/ |,|;|\[|\]|\(|\)|\*|-|'|=//g;
+    # limit to 10 char, should be enough, and limit the DB size
+    $title = substr($title,0,10);
+    #parse each field
+    my $sth2=$dbh->prepare('SELECT biblionumbers FROM nozebra WHERE 
indexname=? AND value=?');
+    foreach my $field ($record->fields()) {
+        #parse each subfield
+        next if $field->tag <10;
+        foreach my $subfield ($field->subfields()) {
+            my $tag = $field->tag();
+            my $subfieldcode = $subfield->[0];
+            my $indexed=0;
+            # check each index to see if the subfield is stored somewhere
+            # otherwise, store it in __RAW__ index
+            foreach my $key (keys %index) {
+#                 warn "examining $key index : ".$index{$key}." for $tag 
+                if ($index{$key} =~ /$tag\*/ or $index{$key} =~ 
/$tag$subfieldcode/) {
+                    $indexed=1;
+                    my $line= lc $subfield->[1];
+                    # remove meaningless value in the field...
+                    $line =~ 
s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
+                    # ... and split in words
+                    foreach (split / /,$line) {
+                        next unless $_; # skip  empty values (multiple spaces)
+                        # if the entry is already here, improve weight
+#                         warn "managing $_";
+                        if ($result{$key}->{$_} =~ 
/$biblionumber,$title\-(\d);/) {
+                            my $weight=$1+1;
+                            $result{$key}->{$_} =~ 
+                            $result{$key}->{$_} .= 
+                        } else {
+                            # get the value if it exist in the nozebra table, 
otherwise, create it
+                            $sth2->execute($key,$_);
+                            my $existing_biblionumbers = $sth2->fetchrow;
+                            # it exists
+                            if ($existing_biblionumbers) {
+                                warn" existing : $existing_biblionumbers";
+                                $result{$key}->{$_} =$existing_biblionumbers;
+                                my $weight=$1+1;
+                                $result{$key}->{$_} =~ 
+                                $result{$key}->{$_} .= 
+                            # create a new ligne for this entry
+                            } else {
+                                $dbh->do('INSERT INTO nozebra SET 
+                                $result{$key}->{$_}.="$biblionumber,$title-1;";
+                            }
+                        }
+                    }
+                }
+            }
+            # the subfield is not indexed, store it in __RAW__ index anyway
+            unless ($indexed) {
+                my $line= lc $subfield->[1];
+                $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ 
+                # ... and split in words
+                foreach (split / /,$line) {
+                    next unless $_; # skip  empty values (multiple spaces)
+                    # if the entry is already here, improve weight
+                    if ($result{'__RAW__'}->{$_} =~ 
/$biblionumber,$title\-(\d);/) {
+                        my $weight=$1+1;
+                        $result{'__RAW__'}->{$_} =~ 
+                        $result{'__RAW__'}->{$_} .= 
+                    } else {
+                        # get the value if it exist in the nozebra table, 
otherwise, create it
+                        $sth2->execute('__RAW__',$_);
+                        my $existing_biblionumbers = $sth2->fetchrow;
+                        # it exists
+                        if ($existing_biblionumbers) {
+                            $result{'__RAW__'}->{$_} =$existing_biblionumbers;
+                            my $weight=$1+1;
+                            $result{'__RAW__'}->{$_} =~ 
+                            $result{'__RAW__'}->{$_} .= 
+                        # create a new ligne for this entry
+                        } else {
+                            $dbh->do('INSERT INTO nozebra SET 
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return %result;
 =head2 MARCitemchange
 =over 4
@@ -3505,7 +3667,7 @@
 # deal with UNIMARC field 100 (encoding) : create it if needed & set encoding 
to unicode
     if ( $encoding eq "UNIMARC" ) {
         my $string;
-        if ( $record->subfield( 100, "a" ) ) {
+        if ( length($record->subfield( 100, "a" )) == 35 ) {
             $string = $record->subfield( 100, "a" );
             my $f100 = $record->field(100);
@@ -3689,8 +3851,17 @@
-# $Id: Biblio.pm,v 1.201 2007/04/27 14:00:49 hdl Exp $
+# $Id: Biblio.pm,v 1.202 2007/05/02 16:44:31 tipaul Exp $
 # $Log: Biblio.pm,v $
+# Revision 1.202  2007/05/02 16:44:31  tipaul
+# NoZebra SQL index management :
+# * adding 3 subs in Biblio.pm
+# - GetNoZebraIndexes, that get the index structure in a new systempreference 
(added with this commit)
+# - _DelBiblioNoZebra, that retrieve all index entries for a biblio and remove 
in a variable the biblio reference
+# - _AddBiblioNoZebra, that add index entries for a biblio.
+# Note that the 2 _Add and _Del subs work only in a hash variable, to speed up 
things in case of a modif (ie : delete+add). The effective SQL update is done 
in the ModZebra sub (that existed before, and dealed with zebra index).
+# I think the code has to be more deeply tested, but it works at least 
 # Revision 1.201  2007/04/27 14:00:49  hdl
 # Removing $dbh from GetMarcFromKohaField (dbh is not used in this function.)

Index: C4/Search.pm
RCS file: /sources/koha/koha/C4/Search.pm,v
retrieving revision 1.134
retrieving revision 1.135
diff -u -b -r1.134 -r1.135
--- C4/Search.pm        2 May 2007 11:57:11 -0000       1.134
+++ C4/Search.pm        2 May 2007 16:44:31 -0000       1.135
@@ -25,7 +25,7 @@
 # set the version for version checking
-$VERSION = do { my @v = '$Revision: 1.134 $' =~ /\d+/g;
+$VERSION = do { my @v = '$Revision: 1.135 $' =~ /\d+/g;
     shift(@v) . "." . join( "_", map { sprintf "%03d", $_ } @v );
@@ -1454,26 +1454,21 @@
             $title =~ /(.*)-(\d)/;
             # get weight 
             my $ranking =$2;
-            # hint : the result is sorted by title.biblionumber because we can 
have X biblios with the same title
-            # and we don't want to get only 1 result for each of them !!!
             # note that we + the ranking because ranking is calculated on 
weight of EACH term requested.
             # if we ask for "two towers", and "two" has weight 2 in biblio N, 
and "towers" has weight 4 in biblio N
             # biblio N has ranking = 6
-            $count_ranking{$biblionumber}=0 unless 
             $count_ranking{$biblionumber} =+ $ranking;
         # build the result by "inverting" the count_ranking hash
         # hing : as usual, we don't order by ranking only, to avoid having 
only 1 result for each rank. We build an hash on concat(ranking,biblionumber) 
 #         warn "counting";
         foreach (keys %count_ranking) {
-            warn "$_ =".sprintf("%10d",$count_ranking{$_}).'-'.$_;
             $result{sprintf("%10d",$count_ranking{$_}).'-'.$_} = $_;
         # sort the hash and return the same structure as GetRecords (Zebra 
         my $result_hash;
         my $numbers=0;
             foreach my $key (sort {$b <=> $a} (keys %result)) {
-            warn "KEY : $key = ".$result{$key};
                 $result_hash->{'RECORDS'}[$numbers++] = $result{$key};
         # for the requested page, replace biblionumber by the complete record

Index: updater/updatedatabase
RCS file: /sources/koha/koha/updater/updatedatabase,v
retrieving revision 1.162
retrieving revision 1.163
diff -u -b -r1.162 -r1.163
--- updater/updatedatabase      30 Apr 2007 16:16:50 -0000      1.162
+++ updater/updatedatabase      2 May 2007 16:44:31 -0000       1.163
@@ -1,6 +1,6 @@
-# $Id: updatedatabase,v 1.162 2007/04/30 16:16:50 tipaul Exp $
+# $Id: updatedatabase,v 1.163 2007/05/02 16:44:31 tipaul Exp $
 # Database Updater
 # This script checks for required updates to the database.
@@ -530,6 +530,14 @@
             explanation         => 'Active this if you want NOT to use zebra 
(large libraries should avoid this parameters)',
             type                => 'YesNo',
+        {
+            uniquefieldrequired => 'variable',
+            variable            => 'NoZebraIndexes',
+            value               => '0',
+            forceupdate         => {'explanation' => 1, 'type' => 1},
+            explanation         => "Enter a specific hash for NoZebra indexes. 
Enter : 'indexname' => '100a,245a,500*','index2' => '...'",
+            type                => 'Free',
+        },
     userflags => [
@@ -1993,6 +2001,15 @@
 # $Log: updatedatabase,v $
+# Revision 1.163  2007/05/02 16:44:31  tipaul
+# NoZebra SQL index management :
+# * adding 3 subs in Biblio.pm
+# - GetNoZebraIndexes, that get the index structure in a new systempreference 
(added with this commit)
+# - _DelBiblioNoZebra, that retrieve all index entries for a biblio and remove 
in a variable the biblio reference
+# - _AddBiblioNoZebra, that add index entries for a biblio.
+# Note that the 2 _Add and _Del subs work only in a hash variable, to speed up 
things in case of a modif (ie : delete+add). The effective SQL update is done 
in the ModZebra sub (that existed before, and dealed with zebra index).
+# I think the code has to be more deeply tested, but it works at least 
 # Revision 1.162  2007/04/30 16:16:50  tipaul
 # bugfix for updatedatabase : when there is no default value (NULL fields) + 
removing bibliothesaurus table+adding NoZebra systempref (False by default)

Index: misc/migration_tools/rebuild_nozebra.pl
RCS file: /sources/koha/koha/misc/migration_tools/rebuild_nozebra.pl,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -b -r1.2 -r1.3
--- misc/migration_tools/rebuild_nozebra.pl     2 May 2007 11:57:11 -0000       
+++ misc/migration_tools/rebuild_nozebra.pl     2 May 2007 16:44:31 -0000       
@@ -14,7 +14,7 @@
 $|=1; # flushes output
 # limit for database dumping
-my $limit;# = "LIMIT 1000";
+my $limit = "LIMIT 100";
 my $directory;
 my $skip_export;
 my $keep_export;
@@ -47,22 +47,7 @@
 my $i=0;
 my %result;
-my %index = (
-    'title' => 
-    'author' 
-    'isbn' => '010a',
-    'issn' => '011a',
-    'biblionumber' =>'0909',
-    'itemtype' => '200b',
-    'language' => '010a',
-    'publisher' => '210x',
-    'date' => '210d',
-    'note' => 
-    'Koha-Auth-Number' => '6009,6019,6029,6039,6049,6059,6069,6109',
-    'subject' => '600*,601*,606*,610*',
-    'dewey' => '676a',
-    'host-item' => '995a,995c',
-    );
+my %index = GetNoZebraIndexes();
 while (my ($biblionumber) = $sth->fetchrow) {
@@ -71,14 +56,11 @@
     my $record = GetMarcBiblio($biblionumber);
     # get title of the record (to store the 10 first letters with the index)
-    my $title;
-    if (C4::Context->preference('marcflavour') eq 'UNIMARC') {
-        $title = lc($record->subfield('200','a'));
-    } else {
-        $title = lc($record->subfield('245','a'));
-    }
+    my ($titletag,$titlesubfield) = GetMarcFromKohaField('biblio.title');
+    my $title = lc($record->subfield($titletag,$titlesubfield));
     # remove blancks comma (that could cause problem when decoding the string 
for CQL retrieval) and regexp specific values
-    $title =~ s/ |,|;|\[|\]|\(|\)|\*//g;
+    $title =~ s/ |,|;|\[|\]|\(|\)|\*|-|'|=//g;
     # limit to 10 char, should be enough, and limit the DB size
     $title = substr($title,0,10);
     #parse each field
@@ -92,16 +74,20 @@
             # check each index to see if the subfield is stored somewhere
             # otherwise, store it in __RAW__ index
             foreach my $key (keys %index) {
-                if ($index{$key} =~ /$tag\*/ or $index{$key} =~ 
/$tag$subfield/) {
+                if ($index{$key} =~ /$tag\*/ or $index{$key} =~ 
/$tag$subfieldcode/) {
                     my $line= lc $subfield->[1];
-                    $line =~ 
s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\// /g;
+                    # remove meaningless value in the field...
+                    $line =~ 
s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
+                    # ... and split in words
                     foreach (split / /,$line) {
-                        # see if the entry is already here
+                        next unless $_; # skip  empty values (multiple spaces)
+                        # if the entry is already here, improve weight
                         if ($result{$key}->{$_} =~ 
/$biblionumber,$title\-(\d);/) {
                             my $weight=$1+1;
                             $result{$key}->{$_} =~ 
                             $result{$key}->{$_} .= 
+                        # otherwise, create it, with weight=1
                         } else {
@@ -111,8 +97,9 @@
             # the subfield is not indexed, store it in __RAW__ index anyway
             unless ($indexed) {
                 my $line= lc $subfield->[1];
-                $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\// 
+                $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ 
                 foreach (split / /,$line) {
+                        next unless $_;
 #                     warn $record->as_formatted."$_ =>".$title;
                         if ($result{__RAW__}->{$_} =~ 
/$biblionumber,$title\-(\d);/) {
                             my $weight=$1+1;
@@ -131,8 +118,8 @@
 foreach my $key (keys %result) {
     foreach my $index (keys %{$result{$key}}) {
-        if (length($result{$key}->{$index}) >40000) {
-            print length($result{$key}->{$index})." for $key / $index";
+        if (length($result{$key}->{$index}) > 40000) {
+            print length($result{$key}->{$index})."\n for $key / $index\n";

