[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Koha-cvs] koha/misc/migration_tools bulkmarcimport.pl [rel_2_2]
From: |
Thomas D |
Subject: |
[Koha-cvs] koha/misc/migration_tools bulkmarcimport.pl [rel_2_2] |
Date: |
Wed, 26 Apr 2006 07:36:12 +0000 |
CVSROOT: /sources/koha
Module name: koha
Branch: rel_2_2
Changes by: Thomas D <address@hidden> 06/04/26 07:36:12
Modified files:
misc/migration_tools: bulkmarcimport.pl
Log message:
MARC8 to UTF-8 support added without XML problems. Rationalised the
variable
name used for designating MARC flavour to avoid confusion with
character set
encoding.
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/koha/koha/misc/migration_tools/bulkmarcimport.pl.diff?only_with_tag=rel_2_2&tr1=1.1.2.2&tr2=1.1.2.3&r1=text&r2=text
Patches:
Index: koha/misc/migration_tools/bulkmarcimport.pl
diff -u koha/misc/migration_tools/bulkmarcimport.pl:1.1.2.2
koha/misc/migration_tools/bulkmarcimport.pl:1.1.2.3
--- koha/misc/migration_tools/bulkmarcimport.pl:1.1.2.2 Fri Mar 10 04:11:23 2006
+++ koha/misc/migration_tools/bulkmarcimport.pl Wed Apr 26 07:36:12 2006
@@ -2,28 +2,139 @@
# small script that import an iso2709 file into koha 2.0
use strict;
+use warnings;
# Koha modules used
use MARC::File::USMARC;
+# Uncomment the line below and use MARC::File::XML again when it works better.
+# -- thd
+# use MARC::File::XML;
use MARC::Record;
use MARC::Batch;
+use MARC::Charset;
use C4::Context;
use C4::Biblio;
use Time::HiRes qw(gettimeofday);
-
use Getopt::Long;
+binmode(STDOUT, ":utf8");
+
my ( $input_marc_file, $number) = ('',0);
-my ($version, $delete, $test_parameter,$char_encoding, $verbose);
+my ($version, $delete, $test_parameter,$marcFlavour, $verbose);
+
GetOptions(
- 'file:s' => \$input_marc_file,
- 'n' => \$number,
- 'h' => \$version,
- 'd' => \$delete,
- 't' => \$test_parameter,
- 'c:s' => \$char_encoding,
- 'v:s' => \$verbose,
+ 'file:s' => \$input_marc_file,
+ 'n' => \$number,
+ 'h' => \$version,
+ 'd' => \$delete,
+ 't' => \$test_parameter,
+ 'c:s' => \$marcFlavour,
+ 'v:s' => \$verbose,
);
+# FIXME: Management of error conditions needed for record parsing problems
+# and MARC8 character sets with mappings to Unicode not yet included in
+# MARC::Charset. The real world rarity of these problems is not fully tested.
+# Unmapped character sets will throw a warning currently and processing will
+# continue with the error condition. A fairly trivial correction should
+# address some record parsing and unmapped character set problems but I need
+# time to implement a test and correction for undef subfields and revert to
+# MARC8 if mappings are missing. -- thd
+sub fMARC8ToUTF8($$) {
+ my ($record) = shift;
+ my ($verbose) = shift;
+ if ($verbose) {
+ if ($verbose >= 2) {
+ my $leader = $record->leader();
+ $leader =~ s/ /#/g;
+ print "\n000 " . $leader;
+ }
+ }
+ foreach my $field ($record->fields()) {
+ if ($field->is_control_field()) {
+ if ($verbose) {
+ if ($verbose >= 2) {
+ my $fieldName = $field->tag();
+ my $fieldValue = $field->data();
+ $fieldValue =~ s/ /#/g;
+ print "\n" . $fieldName;
+ print ' ' . $fieldValue;
+ }
+ }
+ } else {
+ my @subfieldsArray;
+ my $fieldName = $field->tag();
+ my $indicator1Value = $field->indicator(1);
+ my $indicator2Value = $field->indicator(2);
+ if ($verbose) {
+ if ($verbose >= 2) {
+ $indicator1Value =~ s/ /#/;
+ $indicator2Value =~ s/ /#/;
+ print "\n" . $fieldName . ' ' .
+ $indicator1Value .
+ $indicator2Value;
+ }
+ }
+ foreach my $subfield ($field->subfields()) {
+ my $subfieldName = $subfield->[0];
+ my $subfieldValue = $subfield->[1];
+ $subfieldValue =
MARC::Charset::marc8_to_utf8($subfieldValue);
+
+ # Alas, MARC::Field::update() does not work
correctly.
+ ## push (@subfieldsArray, $subfieldName,
$subfieldValue);
+
+ push @subfieldsArray, [$subfieldName,
$subfieldValue];
+ if ($verbose) {
+ if ($verbose >= 2) {
+ print " \$" . $subfieldName . '
' . $subfieldValue;
+ }
+ }
+ }
+
+ # Alas, MARC::Field::update() does not work correctly.
+ #
+ # The first instance in the field of a of a repeated
subfield
+ # overwrites the content from later instances with the
content
+ # from the first instance.
+ ## $field->update(@subfieldsArray);
+
+ foreach my $subfieldRow(@subfieldsArray) {
+ my $subfieldName = $subfieldRow->[0];
+ $field->delete_subfields($subfieldName);
+ }
+ foreach my $subfieldRow(@subfieldsArray) {
+ $field->add_subfields(@$subfieldRow);
+ }
+
+ if ($verbose) {
+ if ($verbose >= 2) {
+ # Reading the indicator values again is
not necessary.
+ # They were not converted.
+ # $indicator1Value =
$field->indicator(1);
+ # $indicator2Value =
$field->indicator(2);
+ # $indicator1Value =~ s/ /#/;
+ # $indicator2Value =~ s/ /#/;
+ print "\nCONVERTED TO UTF-8:\n" .
$fieldName . ' ' .
+ $indicator1Value .
+ $indicator2Value;
+ foreach my $subfield
($field->subfields()) {
+ my $subfieldName =
$subfield->[0];
+ my $subfieldValue =
$subfield->[1];
+ print " \$" . $subfieldName . '
' . $subfieldValue;
+ }
+ }
+ }
+ if ($verbose) {
+ if ($verbose >= 2) {
+ print "\n" if $verbose;
+ }
+ }
+ }
+ }
+ $record->encoding('UTF-8');
+ return $record;
+}
+
+
if ($version || ($input_marc_file eq '')) {
print <<EOF
small script to import an iso2709 file into Koha.
@@ -33,8 +144,9 @@
\tv : verbose mode. 1 means "some infos", 2 means "MARC dumping"
\tn : the number of the record to import. If missing, all the file is imported
\tt : test mode : parses the file, saying what he would do, but doing nothing.
-\tc : the char encoding. At the moment, only MARC21 and UNIMARC supported.
MARC21 by default.
-\d : delete EVERYTHING related to biblio in koha-DB before import :tables :
+\tc : the characteristic MARC flavour. At the moment, only MARC21 and UNIMARC
+\tsupported. MARC21 by default.
+\td : delete EVERYTHING related to biblio in koha-DB before import :tables :
\t\tbiblio, \t\tbiblioitems, \t\tsubjects,\titems
\t\tadditionalauthors, \tbibliosubtitles, \tmarc_biblio,
\t\tmarc_subfield_table, \tmarc_word, \t\tmarc_blob_subfield
@@ -66,8 +178,8 @@
print "TESTING MODE ONLY\n DOING NOTHING\n===============\n";
}
-$char_encoding = 'MARC21' unless ($char_encoding);
-print "CHAR : $char_encoding\n" if $verbose;
+$marcFlavour = 'MARC21' unless ($marcFlavour);
+print "Characteristic MARC flavour: $marcFlavour\n" if $verbose;
my $starttime = gettimeofday;
my $batch = MARC::Batch->new( 'USMARC', $input_marc_file );
$batch->warnings_off();
@@ -77,23 +189,64 @@
my ($tagfield,$tagsubfield) =
&MARCfind_marc_from_kohafield($dbh,"items.itemnumber",'');
# $dbh->do("lock tables biblio write, biblioitems write, items write,
marc_biblio write, marc_subfield_table write, marc_blob_subfield write,
marc_word write, marc_subfield_structure write, stopwords write");
while ( my $record = $batch->next() ) {
- #FIXME: it's kind of silly to go from MARC::Record to MARC::File::XML and
then back again just to fix the encoding
- my $uxml = $record->as_xml;
- $record = MARC::Record::new_from_xml($uxml, 'UTF-8');
$i++;
+#FIXME: it's kind of silly to go from MARC::Record to MARC::File::XML and
+ # then back again just to fix the encoding
+ #
+ # It is even sillier when the conversion too frequently produces errors
+ # instead of fixing the encoding. Hence, the following MARC::File::XML
+ # lines are now commented out until character set conversion in XML
+ # works better. -- thd
+ ## my $uxml = $record->as_xml;
+ ## $record = MARC::Record::new_from_xml($uxml, 'UTF-8');
+
+ # Check record encoding and convert encoding if necessary.
+
+ if ($marcFlavour eq 'MARC21') {
+ my $tag000_pos09;
+ if ($record->encoding() eq 'UTF-8') {
+ if ($verbose) {
+ print "\nRecord $i encoding is UTF-8\n";
+ $tag000_pos09 = substr ($record->leader, 9, 1);
+ $tag000_pos09 =~ s/ /#/;
+ print "\nUTF-8 LEADER/09: " . $tag000_pos09
."\n";
+ }
+ } elsif ($record->encoding() eq 'MARC-8') {
+ print "\nConverting record $i encoding from MARC8 to
UTF-8\n";
+ # Convert MARC-8 to UTF-8
+ $record = fMARC8ToUTF8($record, $verbose);
+ if ($verbose) {
+ print "\nRecord $i encoding has been converted
to UTF-8\n";
+ $tag000_pos09 = substr ($record->leader, 9, 1);
+ $tag000_pos09 =~ s/ /#/;
+ print "\nUTF-8 LEADER/09: " . $tag000_pos09
."\n";
+ }
+ }
+ } elsif ($marcFlavour eq 'UNIMARC') {
+ # I have not developed a UNIMARC character encoding conversion
script
+ # yet. Common encodings should be easy. Less comon and
multiple
+ # encodings will need extra work. I am happy to work on this
if there
+ # is some interest. -- thd
+ }
+
#now, parse the record, extract the item fields, and store them in
somewhere else.
- ## create an empty record object to populate
- my $newRecord = MARC::Record->new();
+ ## create an empty record object to populate
+ my $newRecord = MARC::Record->new();
$newRecord->leader($record->leader());
- # go through each field in the existing record
- foreach my $oldField ( $record->fields() ) {
+ # go through each field in the existing record
+ foreach my $oldField ( $record->fields() ) {
# just reproduce tags < 010 in our new record
- if ( $oldField->tag() < 10 ) {
- $newRecord->append_fields( $oldField );
- next();
+ #
+ # Fields are not necessarily only numeric in the actual world of
records
+ # nor in what I would recommend for additonal safe non-interfering local
+ # use fields. The following regular expression match is much safer
than
+ # a numeric evaluation. -- thd
+ if ( $oldField->tag() =~ m/^00/ ) {
+ $newRecord->append_fields( $oldField );
+ next();
}
# store our new subfield data in this list
@@ -103,23 +256,25 @@
foreach my $pair ( $oldField->subfields() ) {
$pair->[1] =~ s/\<//g;
$pair->[1] =~ s/\>//g;
- push( @newSubfields, $pair->[0],
char_decode($pair->[1],$char_encoding) );
+ push( @newSubfields, $pair->[0],
char_decode($pair->[1],$marcFlavour) );
}
# add the new field to our new record
my $newField = MARC::Field->new(
- $oldField->tag(),
- $oldField->indicator(1),
- $oldField->indicator(2),
- @newSubfields
+ $oldField->tag(),
+ $oldField->indicator(1),
+ $oldField->indicator(2),
+ @newSubfields
);
$newRecord->append_fields( $newField );
- }
+ }
- warn "$i ==>".$newRecord->as_formatted() if $verbose eq 2;
+ if ($verbose) {
+# warn "$i ==>".$newRecord->as_formatted() if $verbose eq 2;
+ }
my @fields = $newRecord->field($tagfield);
my @items;
my $nbitems=0;
- [Koha-cvs] koha/misc/migration_tools bulkmarcimport.pl [rel_2_2],
Thomas D <=