← Index
NYTProf Performance Profile   « line view »
For svc/members/upsert
  Run on Tue Jan 13 11:50:22 2015
Reported on Tue Jan 13 12:09:50 2015

Filename/mnt/catalyst/koha/C4/Charset.pm
StatementsExecuted 355 statements in 4.46ms
Subroutines
Calls P F Exclusive
Time
Inclusive
Time
Subroutine
1112.32ms2.99msC4::Charset::::BEGIN@24C4::Charset::BEGIN@24
111434µs448µsC4::Charset::::BEGIN@20C4::Charset::BEGIN@20
11110µs13µsC4::Charset::::BEGIN@25C4::Charset::BEGIN@25
1119µs38µsC4::Charset::::BEGIN@23C4::Charset::BEGIN@23
1119µs9µsC4::Charset::::BEGIN@31C4::Charset::BEGIN@31
1119µs41µsC4::Charset::::BEGIN@27C4::Charset::BEGIN@27
1118µs72µsC4::Charset::::BEGIN@26C4::Charset::BEGIN@26
1117µs14µsC4::Charset::::BEGIN@21C4::Charset::BEGIN@21
1116µs50µsC4::Charset::::BEGIN@29C4::Charset::BEGIN@29
0000s0sC4::Charset::::IsStringUTF8ishC4::Charset::IsStringUTF8ish
0000s0sC4::Charset::::MarcToUTF8RecordC4::Charset::MarcToUTF8Record
0000s0sC4::Charset::::NormalizeStringC4::Charset::NormalizeString
0000s0sC4::Charset::::SetMarcUnicodeFlagC4::Charset::SetMarcUnicodeFlag
0000s0sC4::Charset::::SetUTF8FlagC4::Charset::SetUTF8Flag
0000s0sC4::Charset::::StripNonXmlCharsC4::Charset::StripNonXmlChars
0000s0sC4::Charset::::__ANON__[:546]C4::Charset::__ANON__[:546]
0000s0sC4::Charset::::_default_marc21_charconv_to_utf8C4::Charset::_default_marc21_charconv_to_utf8
0000s0sC4::Charset::::_default_unimarc_charconv_to_utf8C4::Charset::_default_unimarc_charconv_to_utf8
0000s0sC4::Charset::::_marc_iso5426_to_utf8C4::Charset::_marc_iso5426_to_utf8
0000s0sC4::Charset::::_marc_marc8_to_utf8C4::Charset::_marc_marc8_to_utf8
0000s0sC4::Charset::::_marc_to_utf8_replacement_charC4::Charset::_marc_to_utf8_replacement_char
0000s0sC4::Charset::::_marc_to_utf8_via_text_iconvC4::Charset::_marc_to_utf8_via_text_iconv
0000s0sC4::Charset::::char_decode5426C4::Charset::char_decode5426
0000s0sC4::Charset::::nsb_cleanC4::Charset::nsb_clean
Call graph for these subroutines as a Graphviz dot language file.
Line State
ments
Time
on line
Calls Time
in subs
Code
1package C4::Charset;
2
3# Copyright (C) 2008 LibLime
4#
5# This file is part of Koha.
6#
7# Koha is free software; you can redistribute it and/or modify it under the
8# terms of the GNU General Public License as published by the Free Software
9# Foundation; either version 2 of the License, or (at your option) any later
10# version.
11#
12# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License along
17# with Koha; if not, write to the Free Software Foundation, Inc.,
18# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19
20228µs2462µs
# spent 448µs (434+14) within C4::Charset::BEGIN@20 which was called: # once (434µs+14µs) by C4::Biblio::BEGIN@37 at line 20
use strict;
# spent 448µs making 1 call to C4::Charset::BEGIN@20 # spent 14µs making 1 call to strict::import
21224µs220µs
# spent 14µs (7+6) within C4::Charset::BEGIN@21 which was called: # once (7µs+6µs) by C4::Biblio::BEGIN@37 at line 21
use warnings;
# spent 14µs making 1 call to C4::Charset::BEGIN@21 # spent 6µs making 1 call to warnings::import
22
23225µs267µs
# spent 38µs (9+29) within C4::Charset::BEGIN@23 which was called: # once (9µs+29µs) by C4::Biblio::BEGIN@37 at line 23
use MARC::Charset qw/marc8_to_utf8/;
# spent 38µs making 1 call to C4::Charset::BEGIN@23 # spent 29µs making 1 call to Exporter::import
242679µs23.00ms
# spent 2.99ms (2.32+670µs) within C4::Charset::BEGIN@24 which was called: # once (2.32ms+670µs) by C4::Biblio::BEGIN@37 at line 24
use Text::Iconv;
# spent 2.99ms making 1 call to C4::Charset::BEGIN@24 # spent 15µs making 1 call to Exporter::import
25222µs215µs
# spent 13µs (10+3) within C4::Charset::BEGIN@25 which was called: # once (10µs+3µs) by C4::Biblio::BEGIN@37 at line 25
use C4::Context;
# spent 13µs making 1 call to C4::Charset::BEGIN@25 # spent 3µs making 1 call to C4::Context::import
26225µs2136µs
# spent 72µs (8+64) within C4::Charset::BEGIN@26 which was called: # once (8µs+64µs) by C4::Biblio::BEGIN@37 at line 26
use C4::Debug;
# spent 72µs making 1 call to C4::Charset::BEGIN@26 # spent 64µs making 1 call to Exporter::import
27227µs273µs
# spent 41µs (9+32) within C4::Charset::BEGIN@27 which was called: # once (9µs+32µs) by C4::Biblio::BEGIN@37 at line 27
use Unicode::Normalize;
# spent 41µs making 1 call to C4::Charset::BEGIN@27 # spent 32µs making 1 call to Exporter::import
28
29247µs294µs
# spent 50µs (6+44) within C4::Charset::BEGIN@29 which was called: # once (6µs+44µs) by C4::Biblio::BEGIN@37 at line 29
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
# spent 50µs making 1 call to C4::Charset::BEGIN@29 # spent 44µs making 1 call to vars::import
30
31
# spent 9µs within C4::Charset::BEGIN@31 which was called: # once (9µs+0s) by C4::Biblio::BEGIN@37 at line 45
BEGIN {
32 # set the version for version checking
331800ns $VERSION = 3.07.00.049;
341400ns require Exporter;
3515µs @ISA = qw(Exporter);
3614µs @EXPORT = qw(
37 NormalizeString
38 IsStringUTF8ish
39 MarcToUTF8Record
40 SetUTF8Flag
41 SetMarcUnicodeFlag
42 StripNonXmlChars
43 nsb_clean
44 );
4513.47ms19µs}
# spent 9µs making 1 call to C4::Charset::BEGIN@31
46
47=head1 NAME
48
49C4::Charset - utilities for handling character set conversions.
50
51=head1 SYNOPSIS
52
53 use C4::Charset;
54
55=head1 DESCRIPTION
56
57This module contains routines for dealing with character set
58conversions, particularly for MARC records.
59
60A variety of character encodings are in use by various MARC
61standards, and even more character encodings are used by
62non-standard MARC records. The various MARC formats generally
63do not do a good job of advertising a given record's character
64encoding, and even when a record does advertise its encoding,
65e.g., via the Leader/09, experience has shown that one cannot
66trust it.
67
68Ultimately, all MARC records are stored in Koha in UTF-8 and
69must be converted from whatever the source character encoding is.
70The goal of this module is to ensure that these conversions
71take place accurately. When a character conversion cannot take
72place, or at least not accurately, the module was provide
73enough information to allow user-facing code to inform the user
74on how to deal with the situation.
75
76=cut
77
78=head1 FUNCTIONS
79
80=head2 IsStringUTF8ish
81
82 my $is_utf8 = IsStringUTF8ish($str);
83
84Determines if C<$str> is valid UTF-8. This can mean
85one of two things:
86
87=over
88
89=item *
90
91The Perl UTF-8 flag is set and the string contains valid UTF-8.
92
93=item *
94
95The Perl UTF-8 flag is B<not> set, but the octets contain
96valid UTF-8.
97
98=back
99
100The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
101because in one could be presented with a MARC blob that is
102not actually in UTF-8 but whose sequence of octets appears to be
103valid UTF-8. The rest of the MARC character conversion functions
104will assume that this situation occur does not very often.
105
106=cut
107
108sub IsStringUTF8ish {
109 my $str = shift;
110
111 return 1 if utf8::is_utf8($str);
112 return utf8::decode($str);
113}
114
115=head2 SetUTF8Flag
116
117 my $marc_record = SetUTF8Flag($marc_record, $nfd);
118
119This function sets the PERL UTF8 flag for data.
120It is required when using new_from_usmarc
121since MARC::File::USMARC does not handle PERL UTF8 setting.
122When editing unicode marc records fields and subfields, you
123would end up in double encoding without using this function.
124
125If $nfd is set, string normalization will use NFD instead of NFC
126
127FIXME
128In my opinion, this function belongs to MARC::Record and not
129to this package.
130But since it handles charset, and MARC::Record, it finds its way in that package
131
132=cut
133
134sub SetUTF8Flag{
135 my ($record, $nfd)=@_;
136 return unless ($record && $record->fields());
137 foreach my $field ($record->fields()){
138 if ($field->tag()>=10){
139 my @subfields;
140 foreach my $subfield ($field->subfields()){
141 push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
142 }
143 eval {
144 my $newfield=MARC::Field->new(
145 $field->tag(),
146 $field->indicator(1),
147 $field->indicator(2),
148 @subfields
149 );
150 $field->replace_with($newfield);
151 };
152 warn "ERROR occurred in SetUTF8Flag $@" if $@;
153 }
154 }
155}
156
157=head2 NormalizeString
158
159 my $normalized_string=NormalizeString($string,$nfd,$transform);
160
161Given a string
162nfd : If you want to set NFD and not NFC
163transform : If you expect all the signs to be removed
164
165Sets the PERL UTF8 Flag on your initial data if need be
166and applies cleaning if required
167
168Returns a utf8 NFC normalized string
169
170Sample code :
171 my $string=NormalizeString ("l'ornithoptère");
172 #results into ornithoptère in NFC form and sets UTF8 Flag
173
174=cut
175
176
177sub NormalizeString{
178 my ($string,$nfd,$transform)=@_;
179 utf8::decode($string) unless (utf8::is_utf8($string));
180 if ($nfd){
181 $string= NFD($string);
182 }
183 else {
184 $string=NFC($string);
185 }
186 if ($transform){
187 $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
188 #removing one letter words "d'" "l'" was changed into "d " "l "
189 $string=~s/\b\S\b//g;
190 $string=~s/\s+$//g;
191 }
192 return $string;
193}
194
195=head2 MarcToUTF8Record
196
197 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
198 $marc_flavour, [, $source_encoding]);
199
200Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
201optional source encoding, return a C<MARC::Record> that is
202converted to UTF-8.
203
204The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
205is not guaranteed to have been converted correctly. Specifically,
206if C<$converted_from> is 'failed', the MARC record returned failed
207character conversion and had each of its non-ASCII octets changed
208to the Unicode replacement character.
209
210If the source encoding was not specified, this routine will
211try to guess it; the character encoding used for a successful
212conversion is returned in C<$converted_from>.
213
214=cut
215
216sub MarcToUTF8Record {
217 my $marc = shift;
218 my $marc_flavour = shift;
219 my $source_encoding = shift;
220 my $marc_record;
221 my $marc_blob_is_utf8 = 0;
222 if (ref($marc) eq 'MARC::Record') {
223 my $marc_blob = $marc->as_usmarc();
224 $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
225 $marc_record = $marc;
226 } else {
227 # dealing with a MARC blob
228
229 # remove any ersatz whitespace from the beginning and
230 # end of the MARC blob -- these can creep into MARC
231 # files produced by several sources -- caller really
232 # should be doing this, however
233 $marc =~ s/^\s+//;
234 $marc =~ s/\s+$//;
235 $marc_blob_is_utf8 = IsStringUTF8ish($marc);
236 eval {
237 $marc_record = MARC::Record->new_from_usmarc($marc);
238 };
239 if ($@) {
240 # if we fail the first time, one likely problem
241 # is that we have a MARC21 record that says that it's
242 # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
243 # We'll try parsing it again.
244 substr($marc, 9, 1) = ' ';
245 eval {
246 $marc_record = MARC::Record->new_from_usmarc($marc);
247 };
248 if ($@) {
249 # it's hopeless; return an empty MARC::Record
250 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
251 }
252 }
253 }
254
255 # If we do not know the source encoding, try some guesses
256 # as follows:
257 # 1. Record is UTF-8 already.
258 # 2. If MARC flavor is MARC21 or NORMARC, then
259 # a. record is MARC-8
260 # b. record is ISO-8859-1
261 # 3. If MARC flavor is UNIMARC, then
262 if (not defined $source_encoding) {
263 if ($marc_blob_is_utf8) {
264 # note that for MARC21/NORMARC we are not bothering to check
265 # if the Leader/09 is set to 'a' or not -- because
266 # of problems with various ILSs (including Koha in the
267 # past, alas), this just is not trustworthy.
268 SetMarcUnicodeFlag($marc_record, $marc_flavour);
269 return $marc_record, 'UTF-8', [];
270 } else {
271 if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
272 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
273 } elsif ($marc_flavour =~/UNIMARC/) {
274 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
275 } else {
276 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
277 }
278 }
279 } else {
280 # caller knows the character encoding
281 my $original_marc_record = $marc_record->clone();
282 my @errors;
283 if ($source_encoding =~ /utf-?8/i) {
284 if ($marc_blob_is_utf8) {
285 SetMarcUnicodeFlag($marc_record, $marc_flavour);
286 return $marc_record, 'UTF-8', [];
287 } else {
288 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
289 }
290 } elsif ($source_encoding =~ /marc-?8/i) {
291 @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
292 } elsif ($source_encoding =~ /5426/) {
293 @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
294 } else {
295 # assume any other character encoding is for Text::Iconv
296 @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
297 }
298
299 if (@errors) {
300 _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
301 return $original_marc_record, 'failed', \@errors;
302 } else {
303 return $marc_record, $source_encoding, [];
304 }
305 }
306
307}
308
309=head2 SetMarcUnicodeFlag
310
311 SetMarcUnicodeFlag($marc_record, $marc_flavour);
312
313Set both the internal MARC::Record encoding flag
314and the appropriate Leader/09 (MARC21) or
315100/26-29 (UNIMARC) to indicate that the record
316is in UTF-8. Note that this does B<not> do
317any actual character conversion.
318
319=cut
320
321sub SetMarcUnicodeFlag {
322 my $marc_record = shift;
323 my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
324
325 $marc_record->encoding('UTF-8');
326 if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
327 my $leader = $marc_record->leader();
328 substr($leader, 9, 1) = 'a';
329 $marc_record->leader($leader);
330 } elsif ($marc_flavour =~/UNIMARC/) {
331 my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
332 $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
333 my $string;
334 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
335 $string=$marc_record->subfield( 100, "a" );
336 if (defined $string && length($string)==$subflength) {
337 $string = substr $string, 0,$subflength if (length($string)>$subflength);
338 }
339 else {
340 $string = POSIX::strftime( "%Y%m%d", localtime );
341 $string =~ s/\-//g;
342 $string = sprintf( "%-*s", $subflength, $string );
343 substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
344 }
345 substr( $string, $encodingposition, 3, "y50" );
346 if ( $marc_record->subfield( 100, "a" ) ) {
347 $marc_record->field('100')->update(a=>$string);
348 }
349 else {
350 $marc_record->insert_grouped_field(
351 MARC::Field->new( 100, '', '', "a" => $string ) );
352 }
353 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
354 } else {
355 warn "Unrecognized marcflavour: $marc_flavour";
356 }
357}
358
359=head2 StripNonXmlChars
360
361 my $new_str = StripNonXmlChars($old_str);
362
363Given a string, return a copy with the
364characters that are illegal in XML
365removed.
366
367This function exists to work around a problem
368that can occur with badly-encoded MARC records.
369Specifically, if a UTF-8 MARC record also
370has excape (\x1b) characters, MARC::File::XML
371will let the escape characters pass through
372when as_xml() or as_xml_record() is called. The
373problem is that the escape character is not
374legal in well-formed XML documents, so when
375MARC::File::XML attempts to parse such a record,
376the XML parser will fail.
377
378Stripping such characters will allow a
379MARC::Record->new_from_xml()
380to work, at the possible risk of some data loss.
381
382=cut
383
384sub StripNonXmlChars {
385 my $str = shift;
386 if (!defined($str) || $str eq ""){
387 return "";
388 }
389 $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
390 return $str;
391}
392
- -
395=head2 nsb_clean
396
397=over 4
398
399nsb_clean($string);
400
401=back
402
403Removes Non Sorting Block characters
404
405=cut
406sub nsb_clean {
407 my $NSB = '\x88' ; # NSB : begin Non Sorting Block
408 my $NSE = '\x89' ; # NSE : Non Sorting Block end
409 my $NSB2 = '\x98' ; # NSB : begin Non Sorting Block
410 my $NSE2 = '\x9C' ; # NSE : Non Sorting Block end
411 my $C2 = '\xC2' ; # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
412
413 # handles non sorting blocks
414 my ($string) = @_ ;
415 $_ = $string ;
416 s/$NSB//g ;
417 s/$NSE//g ;
418 s/$NSB2//g ;
419 s/$NSE2//g ;
420 s/$C2//g ;
421 $string = $_ ;
422
423 return($string) ;
424}
425
426
427=head1 INTERNAL FUNCTIONS
428
429=head2 _default_marc21_charconv_to_utf8
430
431 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
432
433Converts a C<MARC::Record> of unknown character set to UTF-8,
434first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
435to UTF-8, then a default conversion that replaces each non-ASCII
436character with the replacement character.
437
438The C<$guessed_charset> return value contains the character set
439that resulted in a conversion to valid UTF-8; note that
440if the MARC-8 and ISO-8859-1 conversions failed, the value of
441this is 'failed'.
442
443=cut
444
445sub _default_marc21_charconv_to_utf8 {
446 my $marc_record = shift;
447 my $marc_flavour = shift;
448
449 my $trial_marc8 = $marc_record->clone();
450 my @all_errors = ();
451 my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
452 unless (@errors) {
453 return $trial_marc8, 'MARC-8', [];
454 }
455 push @all_errors, @errors;
456
457 my $trial_8859_1 = $marc_record->clone();
458 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
459 unless (@errors) {
460 return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
461 # instead if we wanted to report details
462 # of the failed attempt at MARC-8 => UTF-8
463 }
464 push @all_errors, @errors;
465
466 my $default_converted = $marc_record->clone();
467 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
468 return $default_converted, 'failed', \@all_errors;
469}
470
471=head2 _default_unimarc_charconv_to_utf8
472
473 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
474
475Converts a C<MARC::Record> of unknown character set to UTF-8,
476first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
477to UTF-8, then a default conversion that replaces each non-ASCII
478character with the replacement character.
479
480The C<$guessed_charset> return value contains the character set
481that resulted in a conversion to valid UTF-8; note that
482if the MARC-8 and ISO-8859-1 conversions failed, the value of
483this is 'failed'.
484
485=cut
486
487sub _default_unimarc_charconv_to_utf8 {
488 my $marc_record = shift;
489 my $marc_flavour = shift;
490
491 my $trial_marc8 = $marc_record->clone();
492 my @all_errors = ();
493 my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
494 unless (@errors) {
495 return $trial_marc8, 'iso-5426';
496 }
497 push @all_errors, @errors;
498
499 my $trial_8859_1 = $marc_record->clone();
500 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
501 unless (@errors) {
502 return $trial_8859_1, 'iso-8859-1';
503 }
504 push @all_errors, @errors;
505
506 my $default_converted = $marc_record->clone();
507 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
508 return $default_converted, 'failed', \@all_errors;
509}
510
511=head2 _marc_marc8_to_utf8
512
513 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
514
515Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
516If the conversion fails for some reason, an
517appropriate messages will be placed in the returned
518C<@errors> array.
519
520=cut
521
522sub _marc_marc8_to_utf8 {
523 my $marc_record = shift;
524 my $marc_flavour = shift;
525
526 my $prev_ignore = MARC::Charset->ignore_errors();
527 MARC::Charset->ignore_errors(1);
528
529 # trap warnings raised by MARC::Charset
530 my @errors = ();
531 local $SIG{__WARN__} = sub {
532 my $msg = $_[0];
533 if ($msg =~ /MARC.Charset/) {
534 # FIXME - purpose of this regexp is to strip out the
535 # line reference to MARC/Charset.pm, but as it
536 # exists probably won't work quite on Windows --
537 # some sort of minimal-bunch back-tracking RE
538 # would be helpful here
539 $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
540 push @errors, $msg;
541 } else {
542 # if warning doesn't come from MARC::Charset, just
543 # pass it on
544 warn $msg;
545 }
546 };
547
548 foreach my $field ($marc_record->fields()) {
549 if ($field->is_control_field()) {
550 ; # do nothing -- control fields should not contain non-ASCII characters
551 } else {
552 my @converted_subfields;
553 foreach my $subfield ($field->subfields()) {
554 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
555 unless (IsStringUTF8ish($utf8sf)) {
556 # Because of a bug in MARC::Charset 0.98, if the string
557 # has (a) one or more diacritics that (b) are only in character positions
558 # 128 to 255 inclusive, the resulting converted string is not in
559 # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
560 # occurs, upgrade the string in place. Moral of the story seems to be
561 # that pack("U", ...) is better than chr(...) if you need to guarantee
562 # that the resulting string is UTF-8.
563 utf8::upgrade($utf8sf);
564 }
565 push @converted_subfields, $subfield->[0], $utf8sf;
566 }
567
568 $field->replace_with(MARC::Field->new(
569 $field->tag(), $field->indicator(1), $field->indicator(2),
570 @converted_subfields)
571 );
572 }
573 }
574
575 MARC::Charset->ignore_errors($prev_ignore);
576
577 SetMarcUnicodeFlag($marc_record, $marc_flavour);
578
579 return @errors;
580}
581
582=head2 _marc_iso5426_to_utf8
583
584 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
585
586Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
587If the conversion fails for some reason, an
588appropriate messages will be placed in the returned
589C<@errors> array.
590
591FIXME - is ISO-5426 equivalent enough to MARC-8
592that C<MARC::Charset> can be used instead?
593
594=cut
595
596sub _marc_iso5426_to_utf8 {
597 my $marc_record = shift;
598 my $marc_flavour = shift;
599
600 my @errors = ();
601
602 foreach my $field ($marc_record->fields()) {
603 if ($field->is_control_field()) {
604 ; # do nothing -- control fields should not contain non-ASCII characters
605 } else {
606 my @converted_subfields;
607 foreach my $subfield ($field->subfields()) {
608 my $utf8sf = char_decode5426($subfield->[1]);
609 push @converted_subfields, $subfield->[0], $utf8sf;
610 }
611
612 $field->replace_with(MARC::Field->new(
613 $field->tag(), $field->indicator(1), $field->indicator(2),
614 @converted_subfields)
615 );
616 }
617 }
618
619 SetMarcUnicodeFlag($marc_record, $marc_flavour);
620
621 return @errors;
622}
623
624=head2 _marc_to_utf8_via_text_iconv
625
626 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
627
628Convert a C<MARC::Record> to UTF-8 in-place using the
629C<Text::Iconv> CPAN module. Any source encoding accepted
630by the user's iconv installation should work. If
631the source encoding is not recognized on the user's
632server or the conversion fails for some reason,
633appropriate messages will be placed in the returned
634C<@errors> array.
635
636=cut
637
638sub _marc_to_utf8_via_text_iconv {
639 my $marc_record = shift;
640 my $marc_flavour = shift;
641 my $source_encoding = shift;
642
643 my @errors = ();
644 my $decoder;
645 eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
646 if ($@) {
647 push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
648 return @errors;
649 }
650
651 my $prev_raise_error = Text::Iconv->raise_error();
652 Text::Iconv->raise_error(1);
653
654 foreach my $field ($marc_record->fields()) {
655 if ($field->is_control_field()) {
656 ; # do nothing -- control fields should not contain non-ASCII characters
657 } else {
658 my @converted_subfields;
659 foreach my $subfield ($field->subfields()) {
660 my $converted_value;
661 my $conversion_ok = 1;
662 eval { $converted_value = $decoder->convert($subfield->[1]); };
663 if ($@) {
664 $conversion_ok = 0;
665 push @errors, $@;
666 } elsif (not defined $converted_value) {
667 $conversion_ok = 0;
668 push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
669 }
670
671 if ($conversion_ok) {
672 push @converted_subfields, $subfield->[0], $converted_value;
673 } else {
674 $converted_value = $subfield->[1];
675 $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
676 push @converted_subfields, $subfield->[0], $converted_value;
677 }
678 }
679
680 $field->replace_with(MARC::Field->new(
681 $field->tag(), $field->indicator(1), $field->indicator(2),
682 @converted_subfields)
683 );
684 }
685 }
686
687 SetMarcUnicodeFlag($marc_record, $marc_flavour);
688 Text::Iconv->raise_error($prev_raise_error);
689
690 return @errors;
691}
692
693=head2 _marc_to_utf8_replacement_char
694
695 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
696
697Convert a C<MARC::Record> to UTF-8 in-place, adopting the
698unsatisfactory method of replacing all non-ASCII (e.g.,
699where the eight bit is set) octet with the Unicode
700replacement character. This is meant as a last-ditch
701method, and would be best used as part of a UI that
702lets a cataloguer pick various character conversions
703until he or she finds the right one.
704
705=cut
706
707sub _marc_to_utf8_replacement_char {
708 my $marc_record = shift;
709 my $marc_flavour = shift;
710
711 foreach my $field ($marc_record->fields()) {
712 if ($field->is_control_field()) {
713 ; # do nothing -- control fields should not contain non-ASCII characters
714 } else {
715 my @converted_subfields;
716 foreach my $subfield ($field->subfields()) {
717 my $value = $subfield->[1];
718 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
719 push @converted_subfields, $subfield->[0], $value;
720 }
721
722 $field->replace_with(MARC::Field->new(
723 $field->tag(), $field->indicator(1), $field->indicator(2),
724 @converted_subfields)
725 );
726 }
727 }
728
729 SetMarcUnicodeFlag($marc_record, $marc_flavour);
730}
731
732=head2 char_decode5426
733
734 my $utf8string = char_decode5426($iso_5426_string);
735
736Converts a string from ISO-5426 to UTF-8.
737
738=cut
739
740
7411300nsmy %chars;
74211µs$chars{0xb0}=0x0101;#3/0ayn[ain]
7431200ns$chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
744#$chars{0xb2}=0x00e0;#'à';
7451100ns$chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
746#$chars{0xb3}=0x00e7;#'ç';
7471100ns$chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
748# $chars{0xb4}='è';
7491200ns$chars{0xb4}=0x00e8;
7501200ns$chars{0xbd}=0x02b9;
7511200ns$chars{0xbe}=0x02ba;
752# $chars{0xb5}='é';
7531600ns$chars{0xb5}=0x00e9;
7541200ns$chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
7551100ns$chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
7561200ns$chars{0xfa}=0x0153; #oe
7571100ns$chars{0xea}=0x0152; #oe
7581200ns$chars{0x81d1}=0x00b0;
759
760####
761## combined characters iso5426
762
7631200ns$chars{0xc041}=0x1ea2; # capital a with hook above
7641200ns$chars{0xc045}=0x1eba; # capital e with hook above
7651800ns$chars{0xc049}=0x1ec8; # capital i with hook above
7661100ns$chars{0xc04f}=0x1ece; # capital o with hook above
7671200ns$chars{0xc055}=0x1ee6; # capital u with hook above
7681200ns$chars{0xc059}=0x1ef6; # capital y with hook above
7691200ns$chars{0xc061}=0x1ea3; # small a with hook above
7701100ns$chars{0xc065}=0x1ebb; # small e with hook above
7711100ns$chars{0xc069}=0x1ec9; # small i with hook above
7721100ns$chars{0xc06f}=0x1ecf; # small o with hook above
7731100ns$chars{0xc075}=0x1ee7; # small u with hook above
7741200ns$chars{0xc079}=0x1ef7; # small y with hook above
775
776 # 4/1 grave accent
7771200ns$chars{0xc141}=0x00c0; # capital a with grave accent
7781100ns$chars{0xc145}=0x00c8; # capital e with grave accent
7791200ns$chars{0xc149}=0x00cc; # capital i with grave accent
7801100ns$chars{0xc14f}=0x00d2; # capital o with grave accent
7811200ns$chars{0xc155}=0x00d9; # capital u with grave accent
7821100ns$chars{0xc157}=0x1e80; # capital w with grave
78313µs$chars{0xc159}=0x1ef2; # capital y with grave
7841100ns$chars{0xc161}=0x00e0; # small a with grave accent
7851200ns$chars{0xc165}=0x00e8; # small e with grave accent
7861200ns$chars{0xc169}=0x00ec; # small i with grave accent
7871100ns$chars{0xc16f}=0x00f2; # small o with grave accent
7881100ns$chars{0xc175}=0x00f9; # small u with grave accent
7891100ns$chars{0xc177}=0x1e81; # small w with grave
7901200ns$chars{0xc179}=0x1ef3; # small y with grave
791 # 4/2 acute accent
7921100ns$chars{0xc241}=0x00c1; # capital a with acute accent
7931300ns$chars{0xc243}=0x0106; # capital c with acute accent
7941100ns$chars{0xc245}=0x00c9; # capital e with acute accent
7951200ns$chars{0xc247}=0x01f4; # capital g with acute
7961100ns$chars{0xc249}=0x00cd; # capital i with acute accent
7971100ns$chars{0xc24b}=0x1e30; # capital k with acute
7981100ns$chars{0xc24c}=0x0139; # capital l with acute accent
7991200ns$chars{0xc24d}=0x1e3e; # capital m with acute
8001100ns$chars{0xc24e}=0x0143; # capital n with acute accent
8011100ns$chars{0xc24f}=0x00d3; # capital o with acute accent
8021100ns$chars{0xc250}=0x1e54; # capital p with acute
8031200ns$chars{0xc252}=0x0154; # capital r with acute accent
8041100ns$chars{0xc253}=0x015a; # capital s with acute accent
8051200ns$chars{0xc255}=0x00da; # capital u with acute accent
8061100ns$chars{0xc257}=0x1e82; # capital w with acute
8071100ns$chars{0xc259}=0x00dd; # capital y with acute accent
8081100ns$chars{0xc25a}=0x0179; # capital z with acute accent
8091200ns$chars{0xc261}=0x00e1; # small a with acute accent
8101200ns$chars{0xc263}=0x0107; # small c with acute accent
8111100ns$chars{0xc265}=0x00e9; # small e with acute accent
8121200ns$chars{0xc267}=0x01f5; # small g with acute
8131100ns$chars{0xc269}=0x00ed; # small i with acute accent
8141100ns$chars{0xc26b}=0x1e31; # small k with acute
8151100ns$chars{0xc26c}=0x013a; # small l with acute accent
81611µs$chars{0xc26d}=0x1e3f; # small m with acute
8171100ns$chars{0xc26e}=0x0144; # small n with acute accent
8181200ns$chars{0xc26f}=0x00f3; # small o with acute accent
8191100ns$chars{0xc270}=0x1e55; # small p with acute
8201100ns$chars{0xc272}=0x0155; # small r with acute accent
8211100ns$chars{0xc273}=0x015b; # small s with acute accent
8221100ns$chars{0xc275}=0x00fa; # small u with acute accent
8231100ns$chars{0xc277}=0x1e83; # small w with acute
8241100ns$chars{0xc279}=0x00fd; # small y with acute accent
8251200ns$chars{0xc27a}=0x017a; # small z with acute accent
8261100ns$chars{0xc2e1}=0x01fc; # capital ae with acute
8271200ns$chars{0xc2f1}=0x01fd; # small ae with acute
828 # 4/3 circumflex accent
8291200ns$chars{0xc341}=0x00c2; # capital a with circumflex accent
8301200ns$chars{0xc343}=0x0108; # capital c with circumflex
8311200ns$chars{0xc345}=0x00ca; # capital e with circumflex accent
8321100ns$chars{0xc347}=0x011c; # capital g with circumflex
8331200ns$chars{0xc348}=0x0124; # capital h with circumflex
8341100ns$chars{0xc349}=0x00ce; # capital i with circumflex accent
8351200ns$chars{0xc34a}=0x0134; # capital j with circumflex
8361100ns$chars{0xc34f}=0x00d4; # capital o with circumflex accent
8371200ns$chars{0xc353}=0x015c; # capital s with circumflex
8381100ns$chars{0xc355}=0x00db; # capital u with circumflex
8391200ns$chars{0xc357}=0x0174; # capital w with circumflex
8401100ns$chars{0xc359}=0x0176; # capital y with circumflex
8411100ns$chars{0xc35a}=0x1e90; # capital z with circumflex
8421100ns$chars{0xc361}=0x00e2; # small a with circumflex accent
8431200ns$chars{0xc363}=0x0109; # small c with circumflex
8441100ns$chars{0xc365}=0x00ea; # small e with circumflex accent
8451200ns$chars{0xc367}=0x011d; # small g with circumflex
8461100ns$chars{0xc368}=0x0125; # small h with circumflex
8471200ns$chars{0xc369}=0x00ee; # small i with circumflex accent
8481100ns$chars{0xc36a}=0x0135; # small j with circumflex
8491100ns$chars{0xc36e}=0x00f1; # small n with tilde
8501200ns$chars{0xc36f}=0x00f4; # small o with circumflex accent
8511200ns$chars{0xc373}=0x015d; # small s with circumflex
8521100ns$chars{0xc375}=0x00fb; # small u with circumflex
8531200ns$chars{0xc377}=0x0175; # small w with circumflex
8541200ns$chars{0xc379}=0x0177; # small y with circumflex
8551200ns$chars{0xc37a}=0x1e91; # small z with circumflex
856 # 4/4 tilde
8571200ns$chars{0xc441}=0x00c3; # capital a with tilde
8581100ns$chars{0xc445}=0x1ebc; # capital e with tilde
8591200ns$chars{0xc449}=0x0128; # capital i with tilde
8601100ns$chars{0xc44e}=0x00d1; # capital n with tilde
8611200ns$chars{0xc44f}=0x00d5; # capital o with tilde
8621100ns$chars{0xc455}=0x0168; # capital u with tilde
8631200ns$chars{0xc456}=0x1e7c; # capital v with tilde
8641100ns$chars{0xc459}=0x1ef8; # capital y with tilde
8651200ns$chars{0xc461}=0x00e3; # small a with tilde
8661100ns$chars{0xc465}=0x1ebd; # small e with tilde
8671200ns$chars{0xc469}=0x0129; # small i with tilde
8681200ns$chars{0xc46e}=0x00f1; # small n with tilde
8691100ns$chars{0xc46f}=0x00f5; # small o with tilde
8701200ns$chars{0xc475}=0x0169; # small u with tilde
8711200ns$chars{0xc476}=0x1e7d; # small v with tilde
8721100ns$chars{0xc479}=0x1ef9; # small y with tilde
873 # 4/5 macron
8741200ns$chars{0xc541}=0x0100; # capital a with macron
8751200ns$chars{0xc545}=0x0112; # capital e with macron
8761100ns$chars{0xc547}=0x1e20; # capital g with macron
8771200ns$chars{0xc549}=0x012a; # capital i with macron
8781200ns$chars{0xc54f}=0x014c; # capital o with macron
8791100ns$chars{0xc555}=0x016a; # capital u with macron
8801200ns$chars{0xc561}=0x0101; # small a with macron
8811500ns$chars{0xc565}=0x0113; # small e with macron
8821200ns$chars{0xc567}=0x1e21; # small g with macron
88312µs$chars{0xc569}=0x012b; # small i with macron
8841200ns$chars{0xc56f}=0x014d; # small o with macron
8851200ns$chars{0xc575}=0x016b; # small u with macron
8861200ns$chars{0xc572}=0x0159; # small r with macron
8871100ns$chars{0xc5e1}=0x01e2; # capital ae with macron
8881200ns$chars{0xc5f1}=0x01e3; # small ae with macron
889 # 4/6 breve
8901100ns$chars{0xc641}=0x0102; # capital a with breve
8911200ns$chars{0xc645}=0x0114; # capital e with breve
8921100ns$chars{0xc647}=0x011e; # capital g with breve
8931200ns$chars{0xc649}=0x012c; # capital i with breve
8941100ns$chars{0xc64f}=0x014e; # capital o with breve
8951200ns$chars{0xc655}=0x016c; # capital u with breve
8961100ns$chars{0xc661}=0x0103; # small a with breve
8971200ns$chars{0xc665}=0x0115; # small e with breve
8981100ns$chars{0xc667}=0x011f; # small g with breve
8991100ns$chars{0xc669}=0x012d; # small i with breve
9001100ns$chars{0xc66f}=0x014f; # small o with breve
9011200ns$chars{0xc675}=0x016d; # small u with breve
902 # 4/7 dot above
9031100ns$chars{0xc7b0}=0x01e1; # Ain with dot above
9041200ns$chars{0xc742}=0x1e02; # capital b with dot above
9051200ns$chars{0xc743}=0x010a; # capital c with dot above
9061100ns$chars{0xc744}=0x1e0a; # capital d with dot above
9071200ns$chars{0xc745}=0x0116; # capital e with dot above
9081100ns$chars{0xc746}=0x1e1e; # capital f with dot above
9091200ns$chars{0xc747}=0x0120; # capital g with dot above
9101200ns$chars{0xc748}=0x1e22; # capital h with dot above
9111100ns$chars{0xc749}=0x0130; # capital i with dot above
9121200ns$chars{0xc74d}=0x1e40; # capital m with dot above
9131200ns$chars{0xc74e}=0x1e44; # capital n with dot above
9141100ns$chars{0xc750}=0x1e56; # capital p with dot above
9151200ns$chars{0xc752}=0x1e58; # capital r with dot above
9161100ns$chars{0xc753}=0x1e60; # capital s with dot above
9171100ns$chars{0xc754}=0x1e6a; # capital t with dot above
9181200ns$chars{0xc757}=0x1e86; # capital w with dot above
9191100ns$chars{0xc758}=0x1e8a; # capital x with dot above
9201200ns$chars{0xc759}=0x1e8e; # capital y with dot above
9211200ns$chars{0xc75a}=0x017b; # capital z with dot above
9221200ns$chars{0xc761}=0x0227; # small b with dot above
9231100ns$chars{0xc762}=0x1e03; # small b with dot above
9241200ns$chars{0xc763}=0x010b; # small c with dot above
9251100ns$chars{0xc764}=0x1e0b; # small d with dot above
9261100ns$chars{0xc765}=0x0117; # small e with dot above
9271100ns$chars{0xc766}=0x1e1f; # small f with dot above
9281200ns$chars{0xc767}=0x0121; # small g with dot above
9291100ns$chars{0xc768}=0x1e23; # small h with dot above
9301100ns$chars{0xc76d}=0x1e41; # small m with dot above
9311100ns$chars{0xc76e}=0x1e45; # small n with dot above
9321100ns$chars{0xc770}=0x1e57; # small p with dot above
9331100ns$chars{0xc772}=0x1e59; # small r with dot above
9341200ns$chars{0xc773}=0x1e61; # small s with dot above
9351100ns$chars{0xc774}=0x1e6b; # small t with dot above
9361200ns$chars{0xc777}=0x1e87; # small w with dot above
9371100ns$chars{0xc778}=0x1e8b; # small x with dot above
9381100ns$chars{0xc779}=0x1e8f; # small y with dot above
9391200ns$chars{0xc77a}=0x017c; # small z with dot above
940 # 4/8 trema, diaresis
9411100ns$chars{0xc820}=0x00a8; # diaeresis
9421200ns$chars{0xc841}=0x00c4; # capital a with diaeresis
9431200ns$chars{0xc845}=0x00cb; # capital e with diaeresis
9441100ns$chars{0xc848}=0x1e26; # capital h with diaeresis
9451200ns$chars{0xc849}=0x00cf; # capital i with diaeresis
9461100ns$chars{0xc84f}=0x00d6; # capital o with diaeresis
9471600ns$chars{0xc855}=0x00dc; # capital u with diaeresis
9481200ns$chars{0xc857}=0x1e84; # capital w with diaeresis
9491100ns$chars{0xc858}=0x1e8c; # capital x with diaeresis
9501100ns$chars{0xc859}=0x0178; # capital y with diaeresis
9511200ns$chars{0xc861}=0x00e4; # small a with diaeresis
9521100ns$chars{0xc865}=0x00eb; # small e with diaeresis
9531200ns$chars{0xc868}=0x1e27; # small h with diaeresis
9541200ns$chars{0xc869}=0x00ef; # small i with diaeresis
9551100ns$chars{0xc86f}=0x00f6; # small o with diaeresis
9561100ns$chars{0xc874}=0x1e97; # small t with diaeresis
9571200ns$chars{0xc875}=0x00fc; # small u with diaeresis
9581200ns$chars{0xc877}=0x1e85; # small w with diaeresis
9591200ns$chars{0xc878}=0x1e8d; # small x with diaeresis
96012µs$chars{0xc879}=0x00ff; # small y with diaeresis
961 # 4/9 umlaut
9621200ns$chars{0xc920}=0x00a8; # [diaeresis]
9631100ns$chars{0xc961}=0x00e4; # a with umlaut
9641200ns$chars{0xc965}=0x00eb; # e with umlaut
9651200ns$chars{0xc969}=0x00ef; # i with umlaut
9661100ns$chars{0xc96f}=0x00f6; # o with umlaut
9671100ns$chars{0xc975}=0x00fc; # u with umlaut
968 # 4/10 circle above
9691100ns$chars{0xca41}=0x00c5; # capital a with ring above
9701200ns$chars{0xcaad}=0x016e; # capital u with ring above
9711100ns$chars{0xca61}=0x00e5; # small a with ring above
9721100ns$chars{0xca75}=0x016f; # small u with ring above
9731100ns$chars{0xca77}=0x1e98; # small w with ring above
9741100ns$chars{0xca79}=0x1e99; # small y with ring above
975 # 4/11 high comma off centre
976 # 4/12 inverted high comma centred
977 # 4/13 double acute accent
9781100ns$chars{0xcd4f}=0x0150; # capital o with double acute
9791200ns$chars{0xcd55}=0x0170; # capital u with double acute
9801200ns$chars{0xcd6f}=0x0151; # small o with double acute
9811100ns$chars{0xcd75}=0x0171; # small u with double acute
982 # 4/14 horn
9831100ns$chars{0xce54}=0x01a0; # latin capital letter o with horn
9841100ns$chars{0xce55}=0x01af; # latin capital letter u with horn
9851100ns$chars{0xce74}=0x01a1; # latin small letter o with horn
9861200ns$chars{0xce75}=0x01b0; # latin small letter u with horn
987 # 4/15 caron (hacek
9881100ns$chars{0xcf41}=0x01cd; # capital a with caron
9891200ns$chars{0xcf43}=0x010c; # capital c with caron
9901100ns$chars{0xcf44}=0x010e; # capital d with caron
9911200ns$chars{0xcf45}=0x011a; # capital e with caron
9921100ns$chars{0xcf47}=0x01e6; # capital g with caron
9931200ns$chars{0xcf49}=0x01cf; # capital i with caron
9941100ns$chars{0xcf4b}=0x01e8; # capital k with caron
9951200ns$chars{0xcf4c}=0x013d; # capital l with caron
9961100ns$chars{0xcf4e}=0x0147; # capital n with caron
9971200ns$chars{0xcf4f}=0x01d1; # capital o with caron
9981100ns$chars{0xcf52}=0x0158; # capital r with caron
9991200ns$chars{0xcf53}=0x0160; # capital s with caron
10001100ns$chars{0xcf54}=0x0164; # capital t with caron
10011200ns$chars{0xcf55}=0x01d3; # capital u with caron
10021100ns$chars{0xcf5a}=0x017d; # capital z with caron
10031200ns$chars{0xcf61}=0x01ce; # small a with caron
10041100ns$chars{0xcf63}=0x010d; # small c with caron
10051200ns$chars{0xcf64}=0x010f; # small d with caron
10061100ns$chars{0xcf65}=0x011b; # small e with caron
10071100ns$chars{0xcf67}=0x01e7; # small g with caron
10081100ns$chars{0xcf69}=0x01d0; # small i with caron
10091100ns$chars{0xcf6a}=0x01f0; # small j with caron
10101100ns$chars{0xcf6b}=0x01e9; # small k with caron
10111100ns$chars{0xcf6c}=0x013e; # small l with caron
10121100ns$chars{0xcf6e}=0x0148; # small n with caron
10131200ns$chars{0xcf6f}=0x01d2; # small o with caron
10141100ns$chars{0xcf72}=0x0159; # small r with caron
10151100ns$chars{0xcf73}=0x0161; # small s with caron
10161200ns$chars{0xcf74}=0x0165; # small t with caron
10171100ns$chars{0xcf75}=0x01d4; # small u with caron
10181200ns$chars{0xcf7a}=0x017e; # small z with caron
1019 # 5/0 cedilla
10201200ns$chars{0xd020}=0x00b8; # cedilla
10211100ns$chars{0xd043}=0x00c7; # capital c with cedilla
102215µs$chars{0xd044}=0x1e10; # capital d with cedilla
10231200ns$chars{0xd047}=0x0122; # capital g with cedilla
10241200ns$chars{0xd048}=0x1e28; # capital h with cedilla
10251100ns$chars{0xd04b}=0x0136; # capital k with cedilla
10261200ns$chars{0xd04c}=0x013b; # capital l with cedilla
10271100ns$chars{0xd04e}=0x0145; # capital n with cedilla
10281200ns$chars{0xd052}=0x0156; # capital r with cedilla
10291200ns$chars{0xd053}=0x015e; # capital s with cedilla
10301100ns$chars{0xd054}=0x0162; # capital t with cedilla
10311200ns$chars{0xd063}=0x00e7; # small c with cedilla
10321200ns$chars{0xd064}=0x1e11; # small d with cedilla
10331100ns$chars{0xd065}=0x0119; # small e with cedilla
10341200ns$chars{0xd067}=0x0123; # small g with cedilla
10351100ns$chars{0xd068}=0x1e29; # small h with cedilla
10361200ns$chars{0xd06b}=0x0137; # small k with cedilla
10371200ns$chars{0xd06c}=0x013c; # small l with cedilla
10381200ns$chars{0xd06e}=0x0146; # small n with cedilla
10391100ns$chars{0xd072}=0x0157; # small r with cedilla
10401100ns$chars{0xd073}=0x015f; # small s with cedilla
10411100ns$chars{0xd074}=0x0163; # small t with cedilla
1042 # 5/1 rude
1043 # 5/2 hook to left
1044 # 5/3 ogonek (hook to right
10451100ns$chars{0xd320}=0x02db; # ogonek
10461100ns$chars{0xd341}=0x0104; # capital a with ogonek
10471100ns$chars{0xd345}=0x0118; # capital e with ogonek
10481200ns$chars{0xd349}=0x012e; # capital i with ogonek
10491100ns$chars{0xd34f}=0x01ea; # capital o with ogonek
10501200ns$chars{0xd355}=0x0172; # capital u with ogonek
10511200ns$chars{0xd361}=0x0105; # small a with ogonek
10521100ns$chars{0xd365}=0x0119; # small e with ogonek
10531200ns$chars{0xd369}=0x012f; # small i with ogonek
10541100ns$chars{0xd36f}=0x01eb; # small o with ogonek
10551200ns$chars{0xd375}=0x0173; # small u with ogonek
1056 # 5/4 circle below
10571100ns$chars{0xd441}=0x1e00; # capital a with ring below
10581200ns$chars{0xd461}=0x1e01; # small a with ring below
1059 # 5/5 half circle below
10601100ns$chars{0xf948}=0x1e2a; # capital h with breve below
10611100ns$chars{0xf968}=0x1e2b; # small h with breve below
1062 # 5/6 dot below
10631100ns$chars{0xd641}=0x1ea0; # capital a with dot below
10641200ns$chars{0xd642}=0x1e04; # capital b with dot below
10651100ns$chars{0xd644}=0x1e0c; # capital d with dot below
10661200ns$chars{0xd645}=0x1eb8; # capital e with dot below
10671100ns$chars{0xd648}=0x1e24; # capital h with dot below
10681100ns$chars{0xd649}=0x1eca; # capital i with dot below
10691200ns$chars{0xd64b}=0x1e32; # capital k with dot below
10701100ns$chars{0xd64c}=0x1e36; # capital l with dot below
10711200ns$chars{0xd64d}=0x1e42; # capital m with dot below
10721100ns$chars{0xd64e}=0x1e46; # capital n with dot below
10731200ns$chars{0xd64f}=0x1ecc; # capital o with dot below
10741100ns$chars{0xd652}=0x1e5a; # capital r with dot below
10751200ns$chars{0xd653}=0x1e62; # capital s with dot below
10761100ns$chars{0xd654}=0x1e6c; # capital t with dot below
10771200ns$chars{0xd655}=0x1ee4; # capital u with dot below
10781100ns$chars{0xd656}=0x1e7e; # capital v with dot below
10791100ns$chars{0xd657}=0x1e88; # capital w with dot below
10801200ns$chars{0xd659}=0x1ef4; # capital y with dot below
10811100ns$chars{0xd65a}=0x1e92; # capital z with dot below
10821100ns$chars{0xd661}=0x1ea1; # small a with dot below
10831100ns$chars{0xd662}=0x1e05; # small b with dot below
10841100ns$chars{0xd664}=0x1e0d; # small d with dot below
10851200ns$chars{0xd665}=0x1eb9; # small e with dot below
10861100ns$chars{0xd668}=0x1e25; # small h with dot below
10871100ns$chars{0xd669}=0x1ecb; # small i with dot below
10881100ns$chars{0xd66b}=0x1e33; # small k with dot below
10891100ns$chars{0xd66c}=0x1e37; # small l with dot below
10901100ns$chars{0xd66d}=0x1e43; # small m with dot below
10911200ns$chars{0xd66e}=0x1e47; # small n with dot below
10921100ns$chars{0xd66f}=0x1ecd; # small o with dot below
10931100ns$chars{0xd672}=0x1e5b; # small r with dot below
10941100ns$chars{0xd673}=0x1e63; # small s with dot below
10951200ns$chars{0xd674}=0x1e6d; # small t with dot below
10961100ns$chars{0xd675}=0x1ee5; # small u with dot below
10971100ns$chars{0xd676}=0x1e7f; # small v with dot below
10981100ns$chars{0xd677}=0x1e89; # small w with dot below
10991200ns$chars{0xd679}=0x1ef5; # small y with dot below
11001100ns$chars{0xd67a}=0x1e93; # small z with dot below
1101 # 5/7 double dot below
11021200ns$chars{0xd755}=0x1e72; # capital u with diaeresis below
11031100ns$chars{0xd775}=0x1e73; # small u with diaeresis below
1104 # 5/8 underline
11051200ns$chars{0xd820}=0x005f; # underline
1106 # 5/9 double underline
11071100ns$chars{0xd920}=0x2017; # double underline
1108 # 5/10 small low vertical bar
11091200ns$chars{0xda20}=0x02cc; #
1110 # 5/11 circumflex below
1111 # 5/12 (this position shall not be used)
1112 # 5/13 left half of ligature sign and of double tilde
1113 # 5/14 right half of ligature sign
1114 # 5/15 right half of double tilde
1115# map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1116
1117sub char_decode5426 {
1118 my ( $string) = @_;
1119 my $result;
1120
1121 my @data = unpack("C*", $string);
1122 my @characters;
1123 my $length=scalar(@data);
1124 for (my $i = 0; $i < scalar(@data); $i++) {
1125 my $char= $data[$i];
1126 if ($char >= 0x00 && $char <= 0x7F){
1127 #IsAscii
1128
1129 push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1130 }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1131 #Combined Char
1132 my $convchar ;
1133 if ($chars{$char*256+$data[$i+1]}) {
1134 $convchar= $chars{$char * 256 + $data[$i+1]};
1135 $i++;
1136# printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1137 } elsif ($chars{$char}) {
1138 $convchar= $chars{$char};
1139# printf "0xC char %x, converted %x\n",$char,$chars{$char};
1140 }else {
1141 $convchar=$char;
1142 }
1143 push @characters,$convchar;
1144 } else {
1145 my $convchar;
1146 if ($chars{$char}) {
1147 $convchar= $chars{$char};
1148# printf "char %x, converted %x\n",$char,$chars{$char};
1149 }else {
1150# printf "char %x $char\n",$char;
1151 $convchar=$char;
1152 }
1153 push @characters,$convchar;
1154 }
1155 }
1156 $result=pack "U*",@characters;
1157# $result=~s/\x01//;
1158# $result=~s/\x00//;
1159 $result=~s/\x0f//;
1160 $result=~s/\x1b.//;
1161 $result=~s/\x0e//;
1162 $result=~s/\x1b\x5b//;
1163# map{printf "%x",$_} @characters;
1164# printf "\n";
1165 return $result;
1166}
1167
1168145µs1;
1169
1170
1171=head1 AUTHOR
1172
1173Koha Development Team <http://koha-community.org/>
1174
1175Galen Charlton <galen.charlton@liblime.com>
1176
1177=cut