koha-cvs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Koha-cvs] koha/misc build_marc_Tword.pl build_marc_word.pl [rel_3_0]


From: paul poulain
Subject: [Koha-cvs] koha/misc build_marc_Tword.pl build_marc_word.pl [rel_3_0]
Date: Fri, 17 Nov 2006 12:56:37 +0000

CVSROOT:        /sources/koha
Module name:    koha
Branch:         rel_3_0
Changes by:     paul poulain <tipaul>   06/11/17 12:56:37

Removed files:
        misc           : build_marc_Tword.pl build_marc_word.pl 

Log message:
        removing useless scripts

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/koha/misc/build_marc_Tword.pl?cvsroot=koha&only_with_tag=rel_3_0&r1=1.3&r2=0
http://cvs.savannah.gnu.org/viewcvs/koha/misc/build_marc_word.pl?cvsroot=koha&only_with_tag=rel_3_0&r1=1.1&r2=0

Patches:
Index: build_marc_Tword.pl
===================================================================
RCS file: build_marc_Tword.pl
diff -N build_marc_Tword.pl
--- build_marc_Tword.pl 1 Jun 2005 18:55:08 -0000       1.3
+++ /dev/null   1 Jan 1970 00:00:00 -0000
@@ -1,129 +0,0 @@
-#!/usr/bin/perl -w
-#-----------------------------------
-# Script Name: build_marc_Tword.pl
-# Script Version: 0.1.0
-# Date:  2004/06/05
-
-# script to build a marc_Tword table.
-# create the table :
-# CREATE TABLE `marc_Tword` (
-#  `word` varchar(80) NOT NULL default '',
-#  `usedin` text NOT NULL,
-#  `tagsubfield` varchar(4) NOT NULL default '',
-#  PRIMARY KEY  (`word`,`tagsubfield`)
-#) TYPE=MyISAM;
-# just to test the idea of a reversed index searching.
-# reversed index for searchs on Title.
-# the marc_Tword table contains for each word & marc field/subfield, the list of biblios using it, with the title
-# reminder : the inverted index is only done to search on a "contain". For a "=" or "start by", the marc_subfield_table is perfect & correctly indexed.
-# if this POC becomes more than a POC, then I think we will have to build 1 table for each sorting (marc_Tword for title, Aword for author, Cword for callnumber...)
-
-# FIXME :
-# * indexes empty words too (it's just a proof of concept)
-# * maybe it would be OK to store only 20 char of the title.
-
-use strict;
-use locale;
-use C4::Context;
-use C4::Biblio;
-my $dbh=C4::Context->dbh;
-use Time::HiRes qw(gettimeofday);
-
-# fields & subfields to ignore
-# in real situation, we should add a marc constraint on this.
-# ideally, we should not inde isbn, as every would be different, so it makes the table very big.
-# but in this case we have to find a way to automatically search "isbn = XXX" in marc_subfield_table
-
-my %ignore_list = (
-       '001' =>1,
-       '010b'=>1,
-       '0909' => 1,
-       '090a' => 1,
-       '100' => 1,
-       '105' => 1,
-       '6069' => 1,
-       '7009' => 1,
-       '7019' => 1,
-       '7109' => 1,
-       '7129' => 1,
-       '9959' => 1,
-);
-
-my $starttime = gettimeofday;
-
-$dbh->do("delete from marc_Tword");
-
-# parse every line
-my $query="SELECT biblio.biblionumber,tag,subfieldcode,subfieldvalue,biblio.title FROM marc_subfield_table left join marc_biblio on marc_biblio.bibid=marc_subfield_table.bibid left join biblio on marc_biblio.biblionumber=biblio.biblionumber where tag=?";
-my $sth=$dbh->prepare($query);
-
-for (my $looptag=0;$looptag<=999;$looptag++) {
-       print "******** SELECTING ".(sprintf "%03s",$looptag)."\n";
-       $sth->execute(sprintf "%03s",$looptag);
-       print "******** DONE \n";
-       $|=1; # flushes output
-       
-       my $sthT=$dbh->prepare("select usedin from marc_Tword where tagsubfield=? and word=?");
-       my $updateT=$dbh->prepare("update marc_Tword set usedin=? where tagsubfield=? and word=?");
-       my $insertT=$dbh->prepare("insert into marc_Tword (tagsubfield,word,usedin) values (?,?,?)");
-       my $i=0;
-       my $timeneeded;
-       # 1st version, slower, but less RAM consumming
-       # while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
-       #       next if $ignore_list{"$tag.$subfieldcode"};
-       #     $subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
-       #       # remove useless chars in the title.
-       #     $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
-       #     my @words = split / /, $subfieldvalue;
-       #       # and retrieve the reversed entry
-       #       foreach my $word (@words) {
-       #               $sthT->execute($tag.$subfieldcode,$word);
-       #               if (my ($usedin) = $sthT->fetchrow) {
-       #                       # add the field & save it once again.
-       #                       $usedin.=",$biblionumber-$title";
-       #                       $updateT->execute($usedin,$tag.$subfieldcode,$word);
-       #               } else {
-       #                       $insertT->execute($tag.$subfieldcode,$word,",$title-$biblionumber");
-       #               }
-       #       }
-       #       $timeneeded = gettimeofday - $starttime unless ($i % 100);
-       #       print "$i in $timeneeded s\n" unless ($i % 100);
-       #       print ".";
-       #       $i++;
-       # }
-       
-       # 2nd version : faster (about 100 times !), bug maybe too much RAM consumming...
-       my %largehash;
-#      print "READING\n";
-       $timeneeded = gettimeofday - $starttime unless ($i % 30000);
-       print "READING $timeneeded s\n";
-       while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
-               next unless $subfieldvalue;
-               next if $ignore_list{$tag.$subfieldcode};
-               $subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g if $subfieldvalue;
-               # remove useless chars in the title.
-               $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g if $title;
-               my @words = split / /, $subfieldvalue;
-               # and retrieve the reversed entry
-               foreach my $word (@words) {
-                       my $localkey = $tag.$subfieldcode.'|'.uc($word);
-                       $largehash{$localkey}.=",".substr($title,0,15)."-$biblionumber";
-               }
-               $timeneeded = gettimeofday - $starttime unless ($i % 30000);
-               print "$i in $timeneeded s\n" unless ($i % 30000);
-               print "." unless ($i % 500);
-               $i++;
-       }
-       $i=0;
-       print "WRITING\n";
-       foreach my $k (keys %largehash) {
-               $k =~ /(.*)\|(.*)/;
-               $insertT->execute($1,$2,$largehash{$k});
-               $timeneeded = gettimeofday - $starttime unless ($i % 30000);
-               print "$i in $timeneeded s\n" unless ($i % 30000);
-               print "." unless ($i % 500);
-               $i++;
-       }
-}
-
-$dbh->disconnect();

Index: build_marc_word.pl
===================================================================
RCS file: build_marc_word.pl
diff -N build_marc_word.pl
--- build_marc_word.pl  11 Jun 2004 15:07:48 -0000      1.1
+++ /dev/null   1 Jan 1970 00:00:00 -0000
@@ -1,114 +0,0 @@
-#!/usr/bin/perl -w
-#-----------------------------------
-# Script Name: build_marc_word.pl
-# Script Version: 0.1.0
-# Date:  2004/06/05
-# Author:  Joshua Ferraro [jmf at kados dot org]
-# Description: This script builds a new marc_word
-#  table with a reduced number of tags (only those
-#  tags that should be searched) allowing for
-#  faster and more accurate searching when used
-#  with the SearchMarc routines.  Make sure that
-#  the MARCaddword routine in Biblio.pm will index
-#  characters >= 1 char; otherwise, searches like
-#  "O'brian, Patrick" will fail as the search 
-#  routines will seperate that query into "o", 
-#  "brian", and "patrick".  (If "o" is not in the
-#  database the search will fail)
-# Usage: build_marc_word.pl
-# Revision History:
-#    0.1.0  2004/06/11:  first working version.
-#                       Thanks to Chris Cormack
-#                       for helping with the $data object
-#                       and Stephen Hedges for providing
-#                       the list of MARC tags.
-# FixMe:
-#   *Should add a few parameters like 'delete from
-#    marc_word' or make script ask user whether to
-#    perform that task ...
-#   *Add a 'status' report as the data is loaded ... 
-#-----------------------------------
-use lib '/usr/local/koha/intranet/modules/';
-use strict;
-use C4::Context;
-use C4::Biblio;
-my $dbh=C4::Context->dbh;
-
-#Here is where you name the tags that you wish to index.  If you
-# are using MARC21 this set of default tags should be fine but you
-# may need to add holdings tags specific to your library (e.g., holding
-# branch for Nelsonville is 942k but that may not be the case for your
-# library).
-my @tags=(
-
-#Tag documentation from http://lcweb.loc.gov/marc/bibliographic/ecbdhome.html
-
-"020a", # INTERNATIONAL STANDARD BOOK NUMBER
-"022a", # INTERNATIONAL STANDARD SERIAL NUMBER
-"100a",        # MAIN ENTRY--PERSONAL NAME
-"110a",        # MAIN ENTRY--CORPORATE NAME
-"110b",        #   Subordinate unit
-"110c",        #   Location of meeting
-"111a", # MAIN ENTRY--MEETING NAME
-"111c", #   Location of meeting
-"130a", # MAIN ENTRY--UNIFORM TITLE 
-"240a", # UNIFORM TITLE 
-"245a", # TITLE STATEMENT
-"245b", #   Remainder of title
-"245c", #   Statement of responsibility, etc.
-"245p", #   Name of part/section of a work
-"246a", # VARYING FORM OF TITLE
-"246b", #   Remainder of title
-"260b", # PUBLICATION, DISTRIBUTION, ETC. (IMPRINT)
-"440a", # SERIES STATEMENT/ADDED ENTRY--TITLE
-"440p", #   Name of part/section of a work
-"500a", # GENERAL NOTE
-"505t", # FORMATTED CONTENTS NOTE (t is Title)
-"511a", # PARTICIPANT OR PERFORMER NOTE
-"520a", # SUMMARY, ETC.
-"534a", # ORIGINAL VERSION NOTE 
-"534k", #   Key title of original
-"534t", #   Title statement of original
-"586a", # AWARDS NOTE
-"600a", # SUBJECT ADDED ENTRY--PERSONAL NAME 
-"610a", # SUBJECT ADDED ENTRY--CORPORATE NAME
-"611a", # SUBJECT ADDED ENTRY--MEETING NAME
-"630a", # SUBJECT ADDED ENTRY--UNIFORM TITLE
-"650a", # SUBJECT ADDED ENTRY--TOPICAL TERM
-"651a", # SUBJECT ADDED ENTRY--GEOGRAPHIC NAME
-"700a", # ADDED ENTRY--PERSONAL NAME
-"710a", # ADDED ENTRY--CORPORATE NAME
-"711a", # ADDED ENTRY--MEETING NAME
-"720a", # ADDED ENTRY--UNCONTROLLED NAME
-"730a", # ADDED ENTRY--UNIFORM TITLE
-"740a", # ADDED ENTRY--UNCONTROLLED RELATED/ANALYTICAL TITLE
-"752a", # ADDED ENTRY--HIERARCHICAL PLACE NAME
-"800a", # SERIES ADDED ENTRY--PERSONAL NAME
-"810a", # SERIES ADDED ENTRY--CORPORATE NAME
-"811a", # SERIES ADDED ENTRY--MEETING NAME
-"830a", # SERIES ADDED ENTRY--UNIFORM TITLE
-"942k"  # Holdings Branch ?? Unique to NPL??
-);
-
-#note that subfieldcode in marc_subfield_table is subfieldid in marc_word ... even
-#though there is another subfieldid in marc_subfield_table--very confusing naming conventions!
-
-#For each tag we run a search to find the necessary data for building the marc_word table
-foreach my $this_tagid(@tags) {
-       my $query="SELECT bibid,tag,tagorder,subfieldcode,subfieldorder,subfieldvalue FROM marc_subfield_table WHERE tag=? AND subfieldcode=?";
-       my $sth=$dbh->prepare($query);
-
-       my ($tag, $subfieldid);
-
-#split the tag into tag, subfield
-       if ($this_tagid =~ s/(\D+)//) {
-               $subfieldid = $1;
-               $tag = $this_tagid;
-       }
-#Then we pass this information on to MARCaddword in Biblio.pm to actually perform the import into marc_word
-       $sth->execute($tag, $subfieldid);
-       while (my $data=$sth->fetchrow_hashref()){
-               MARCaddword($dbh,$data->{'bibid'},$data->{'tag'},$data->{'tagorder'},$data->{'subfieldcode'},$data->{'subfieldorder'},$data->{'subfieldvalue'});
-       }
-}
-$dbh->disconnect();




reply via email to

[Prev in Thread] Current Thread [Next in Thread]