koha-cvs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Koha-cvs] CVS: koha/misc build_marc_Tword.pl,1.1,1.2


From: Paul POULAIN
Subject: [Koha-cvs] CVS: koha/misc build_marc_Tword.pl,1.1,1.2
Date: Wed, 01 Jun 2005 11:47:41 -0700

Update of /cvsroot/koha/koha/misc
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv12257/misc

Modified Files:
        build_marc_Tword.pl 
Log Message:
New version that processes the data tag by tag. It should be slower, but requires a lot less memory.

Index: build_marc_Tword.pl
===================================================================
RCS file: /cvsroot/koha/koha/misc/build_marc_Tword.pl,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -r1.1 -r1.2
*** build_marc_Tword.pl 27 May 2005 09:30:24 -0000      1.1
--- build_marc_Tword.pl 1 Jun 2005 18:47:38 -0000       1.2
***************
*** 55,124 ****
  
  # parse every line
! my $query="SELECT 
biblio.biblionumber,tag,subfieldcode,subfieldvalue,biblio.title FROM 
marc_subfield_table left join marc_biblio on 
marc_biblio.bibid=marc_subfield_table.bibid left join biblio on 
marc_biblio.biblionumber=biblio.biblionumber";
  my $sth=$dbh->prepare($query);
  
! print "******** SELECTING \n";
! $sth->execute;
! print "******** DONE \n";
! $|=1; # flushes output
! 
! my $sthT=$dbh->prepare("select usedin from marc_Tword where tagsubfield=? and 
word=?");
! my $updateT=$dbh->prepare("update marc_Tword set usedin=? where tagsubfield=? 
and word=?");
! my $insertT=$dbh->prepare("insert into marc_Tword (tagsubfield,word,usedin) 
values (?,?,?)");
! my $i=0;
! my $timeneeded;
! # 1st version, slower, but less RAM consumming
! # while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = 
$sth->fetchrow) {
! #     next if $ignore_list{"$tag.$subfieldcode"};
! #     $subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
! #     # remove useless chars in the title.
! #     $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
! #     my @words = split / /, $subfieldvalue;
! #     # and retrieve the reversed entry
! #     foreach my $word (@words) {
! #             $sthT->execute($tag.$subfieldcode,$word);
! #             if (my ($usedin) = $sthT->fetchrow) {
! #                     # add the field & save it once again.
! #                     $usedin.=",$biblionumber-$title";
! #                     $updateT->execute($usedin,$tag.$subfieldcode,$word);
! #             } else {
! #                     
$insertT->execute($tag.$subfieldcode,$word,",$title-$biblionumber");
! #             }
! #     }
! #     $timeneeded = gettimeofday - $starttime unless ($i % 100);
! #     print "$i in $timeneeded s\n" unless ($i % 100);
! #     print ".";
! #     $i++;
! # }
! 
! # 2nd version : faster (about 100 times !), bug maybe too much RAM 
consumming...
! my %largehash;
! print "READING\n";
! while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = 
$sth->fetchrow) {
!       next unless $subfieldvalue;
!       next if $ignore_list{$tag.$subfieldcode};
!     $subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
!       # remove useless chars in the title.
!     $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
!     my @words = split / /, $subfieldvalue;
!       # and retrieve the reversed entry
!       foreach my $word (@words) {
!               my $localkey = $tag.$subfieldcode.'|'.uc($word);
!               $largehash{$localkey}.=",$title-$biblionumber";
        }
-       $timeneeded = gettimeofday - $starttime unless ($i % 30000);
-       print "$i in $timeneeded s\n" unless ($i % 30000);
-       print "." unless ($i % 500);
-       $i++;
- }
- $i=0;
- print "WRITING\n";
- foreach my $k (keys %largehash) {
-       $k =~ /(.*)\|(.*)/;
-       $insertT->execute($1,$2,$largehash{$k});
-       $timeneeded = gettimeofday - $starttime unless ($i % 30000);
-       print "$i in $timeneeded s\n" unless ($i % 30000);
-       print "." unless ($i % 500);
-       $i++;
  }
  
--- 55,126 ----
  
  # parse every line
! my $query="SELECT 
biblio.biblionumber,tag,subfieldcode,subfieldvalue,biblio.title FROM 
marc_subfield_table left join marc_biblio on 
marc_biblio.bibid=marc_subfield_table.bibid left join biblio on 
marc_biblio.biblionumber=biblio.biblionumber and tag=?";
  my $sth=$dbh->prepare($query);
  
! for (my $looptag=0;$looptag<=999;$looptag++) {
!       print "******** SELECTING ".(sprintf "%03s",$looptag)."\n";
!       $sth->execute(sprintf "%03s",$looptag);
!       print "******** DONE \n";
!       $|=1; # flushes output
!       
!       my $sthT=$dbh->prepare("select usedin from marc_Tword where 
tagsubfield=? and word=?");
!       my $updateT=$dbh->prepare("update marc_Tword set usedin=? where 
tagsubfield=? and word=?");
!       my $insertT=$dbh->prepare("insert into marc_Tword 
(tagsubfield,word,usedin) values (?,?,?)");
!       my $i=0;
!       my $timeneeded;
!       # 1st version, slower, but less RAM consumming
!       # while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, 
$title) = $sth->fetchrow) {
!       #       next if $ignore_list{"$tag.$subfieldcode"};
!       #     $subfieldvalue =~ 
s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
!       #       # remove useless chars in the title.
!       #     $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
!       #     my @words = split / /, $subfieldvalue;
!       #       # and retrieve the reversed entry
!       #       foreach my $word (@words) {
!       #               $sthT->execute($tag.$subfieldcode,$word);
!       #               if (my ($usedin) = $sthT->fetchrow) {
!       #                       # add the field & save it once again.
!       #                       $usedin.=",$biblionumber-$title";
!       #                       
$updateT->execute($usedin,$tag.$subfieldcode,$word);
!       #               } else {
!       #                       
$insertT->execute($tag.$subfieldcode,$word,",$title-$biblionumber");
!       #               }
!       #       }
!       #       $timeneeded = gettimeofday - $starttime unless ($i % 100);
!       #       print "$i in $timeneeded s\n" unless ($i % 100);
!       #       print ".";
!       #       $i++;
!       # }
!       
!       # 2nd version : faster (about 100 times !), bug maybe too much RAM 
consumming...
!       my %largehash;
!       print "READING\n";
!       while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) 
= $sth->fetchrow) {
!               next unless $subfieldvalue;
!               next if $ignore_list{$tag.$subfieldcode};
!               $subfieldvalue =~ 
s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g if $subfieldvalue;
!               # remove useless chars in the title.
!               $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g 
if $title;
!               my @words = split / /, $subfieldvalue;
!               # and retrieve the reversed entry
!               foreach my $word (@words) {
!                       my $localkey = $tag.$subfieldcode.'|'.uc($word);
!                       
$largehash{$localkey}.=",".substr($title,0,15)."-$biblionumber";
!               }
!               $timeneeded = gettimeofday - $starttime unless ($i % 30000);
!               print "$i in $timeneeded s\n" unless ($i % 30000);
!               print "." unless ($i % 500);
!               $i++;
!       }
!       $i=0;
!       print "WRITING\n";
!       foreach my $k (keys %largehash) {
!               $k =~ /(.*)\|(.*)/;
!               $insertT->execute($1,$2,$largehash{$k});
!               $timeneeded = gettimeofday - $starttime unless ($i % 30000);
!               print "$i in $timeneeded s\n" unless ($i % 30000);
!               print "." unless ($i % 500);
!               $i++;
        }
  }
  




reply via email to

[Prev in Thread] Current Thread [Next in Thread]