← Index
NYTProf Performance Profile   « block view • line view • sub view »
For /usr/share/koha/opac/cgi-bin/opac/opac-search.pl
  Run on Tue Oct 15 17:10:45 2013
Reported on Tue Oct 15 17:12:41 2013

Filename/usr/share/koha/lib/C4/Charset.pm
StatementsExecuted 12360 statements in 65.9ms
Subroutines
Calls P F Exclusive
Time
Inclusive
Time
Subroutine
251125.4ms114msC4::Charset::::SetUTF8FlagC4::Charset::SetUTF8Flag
13191116.1ms27.1msC4::Charset::::NormalizeStringC4::Charset::NormalizeString
25119.22ms9.47msC4::Charset::::CORE:substC4::Charset::CORE:subst (opcode)
2511582µs10.0msC4::Charset::::StripNonXmlCharsC4::Charset::StripNonXmlChars
111340µs1.03msC4::Charset::::BEGIN@24C4::Charset::BEGIN@24
11133µs33µsC4::Charset::::BEGIN@30C4::Charset::BEGIN@30
11126µs105µsC4::Charset::::BEGIN@26C4::Charset::BEGIN@26
11124µs200µsC4::Charset::::BEGIN@25C4::Charset::BEGIN@25
11121µs29µsC4::Charset::::BEGIN@20C4::Charset::BEGIN@20
11118µs185µsC4::Charset::::BEGIN@28C4::Charset::BEGIN@28
11115µs59µsC4::Charset::::BEGIN@23C4::Charset::BEGIN@23
11113µs38µsC4::Charset::::BEGIN@21C4::Charset::BEGIN@21
0000s0sC4::Charset::::IsStringUTF8ishC4::Charset::IsStringUTF8ish
0000s0sC4::Charset::::MarcToUTF8RecordC4::Charset::MarcToUTF8Record
0000s0sC4::Charset::::SetMarcUnicodeFlagC4::Charset::SetMarcUnicodeFlag
0000s0sC4::Charset::::__ANON__[:542]C4::Charset::__ANON__[:542]
0000s0sC4::Charset::::_default_marc21_charconv_to_utf8C4::Charset::_default_marc21_charconv_to_utf8
0000s0sC4::Charset::::_default_unimarc_charconv_to_utf8C4::Charset::_default_unimarc_charconv_to_utf8
0000s0sC4::Charset::::_marc_iso5426_to_utf8C4::Charset::_marc_iso5426_to_utf8
0000s0sC4::Charset::::_marc_marc8_to_utf8C4::Charset::_marc_marc8_to_utf8
0000s0sC4::Charset::::_marc_to_utf8_replacement_charC4::Charset::_marc_to_utf8_replacement_char
0000s0sC4::Charset::::_marc_to_utf8_via_text_iconvC4::Charset::_marc_to_utf8_via_text_iconv
0000s0sC4::Charset::::char_decode5426C4::Charset::char_decode5426
0000s0sC4::Charset::::nsb_cleanC4::Charset::nsb_clean
Call graph for these subroutines as a Graphviz dot language file.
Line State
ments
Time
on line
Calls Time
in subs
Code
1package C4::Charset;
2
3# Copyright (C) 2008 LibLime
4#
5# This file is part of Koha.
6#
7# Koha is free software; you can redistribute it and/or modify it under the
8# terms of the GNU General Public License as published by the Free Software
9# Foundation; either version 2 of the License, or (at your option) any later
10# version.
11#
12# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License along
17# with Koha; if not, write to the Free Software Foundation, Inc.,
18# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19
20333µs236µs
# spent 29µs (21+7) within C4::Charset::BEGIN@20 which was called: # once (21µs+7µs) by C4::Biblio::BEGIN@37 at line 20
use strict;
# spent 29µs making 1 call to C4::Charset::BEGIN@20 # spent 7µs making 1 call to strict::import
21334µs263µs
# spent 38µs (13+25) within C4::Charset::BEGIN@21 which was called: # once (13µs+25µs) by C4::Biblio::BEGIN@37 at line 21
use warnings;
# spent 38µs making 1 call to C4::Charset::BEGIN@21 # spent 25µs making 1 call to warnings::import
22
23336µs2103µs
# spent 59µs (15+44) within C4::Charset::BEGIN@23 which was called: # once (15µs+44µs) by C4::Biblio::BEGIN@37 at line 23
use MARC::Charset qw/marc8_to_utf8/;
# spent 59µs making 1 call to C4::Charset::BEGIN@23 # spent 44µs making 1 call to Exporter::import
243149µs21.06ms
# spent 1.03ms (340µs+693µs) within C4::Charset::BEGIN@24 which was called: # once (340µs+693µs) by C4::Biblio::BEGIN@37 at line 24
use Text::Iconv;
# spent 1.03ms making 1 call to C4::Charset::BEGIN@24 # spent 27µs making 1 call to Exporter::import
25365µs2376µs
# spent 200µs (24+176) within C4::Charset::BEGIN@25 which was called: # once (24µs+176µs) by C4::Biblio::BEGIN@37 at line 25
use C4::Debug;
# spent 200µs making 1 call to C4::Charset::BEGIN@25 # spent 176µs making 1 call to Exporter::import
26375µs2184µs
# spent 105µs (26+79) within C4::Charset::BEGIN@26 which was called: # once (26µs+79µs) by C4::Biblio::BEGIN@37 at line 26
use Unicode::Normalize;
# spent 105µs making 1 call to C4::Charset::BEGIN@26 # spent 79µs making 1 call to Exporter::import
27
283117µs2352µs
# spent 185µs (18+167) within C4::Charset::BEGIN@28 which was called: # once (18µs+167µs) by C4::Biblio::BEGIN@37 at line 28
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
# spent 185µs making 1 call to C4::Charset::BEGIN@28 # spent 167µs making 1 call to vars::import
29
30
# spent 33µs within C4::Charset::BEGIN@30 which was called: # once (33µs+0s) by C4::Biblio::BEGIN@37 at line 44
BEGIN {
31 # set the version for version checking
3213µs $VERSION = 3.07.00.049;
3312µs require Exporter;
34116µs @ISA = qw(Exporter);
35112µs @EXPORT = qw(
36 NormalizeString
37 IsStringUTF8ish
38 MarcToUTF8Record
39 SetUTF8Flag
40 SetMarcUnicodeFlag
41 StripNonXmlChars
42 nsb_clean
43 );
4416.06ms133µs}
# spent 33µs making 1 call to C4::Charset::BEGIN@30
45
46=head1 NAME
47
- -
77=head1 FUNCTIONS
78
- -
107sub IsStringUTF8ish {
108 my $str = shift;
109
110 return 1 if utf8::is_utf8($str);
111 return utf8::decode($str);
112}
113
114=head2 SetUTF8Flag
115
- -
133
# spent 114ms (25.4+88.6) within C4::Charset::SetUTF8Flag which was called 25 times, avg 4.56ms/call: # 25 times (25.4ms+88.6ms) by C4::Search::searchResults at line 2011 of /usr/share/koha/lib/C4/Search.pm, avg 4.56ms/call
sub SetUTF8Flag{
1342545µs my ($record, $nfd)=@_;
13525191µs25176µs return unless ($record && $record->fields());
# spent 176µs making 25 calls to MARC::Record::fields, avg 7µs/call
13625196µs25180µs foreach my $field ($record->fields()){
# spent 180µs making 25 calls to MARC::Record::fields, avg 7µs/call
1375443.71ms5442.54ms if ($field->tag()>=10){
# spent 2.54ms making 544 calls to MARC::Field::tag, avg 5µs/call
138512230µs my @subfields;
1395122.17ms51211.1ms foreach my $subfield ($field->subfields()){
# spent 11.1ms making 512 calls to MARC::Field::subfields, avg 22µs/call
14013195.57ms131927.1ms push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
# spent 27.1ms making 1319 calls to C4::Charset::NormalizeString, avg 21µs/call
141 }
1425125.94ms204838.7ms my $newfield=MARC::Field->new(
# spent 26.4ms making 512 calls to MARC::Field::new, avg 52µs/call # spent 10.0ms making 1024 calls to MARC::Field::indicator, avg 10µs/call # spent 2.24ms making 512 calls to MARC::Field::tag, avg 4µs/call
143 $field->tag(),
144 $field->indicator(1),
145 $field->indicator(2),
146 @subfields
147 );
1485122.87ms5128.87ms $field->replace_with($newfield);
# spent 8.87ms making 512 calls to MARC::Field::replace_with, avg 17µs/call
149 }
150 }
151}
152
153=head2 NormalizeString
154
- -
173
# spent 27.1ms (16.1+11.0) within C4::Charset::NormalizeString which was called 1319 times, avg 21µs/call: # 1319 times (16.1ms+11.0ms) by C4::Charset::SetUTF8Flag at line 140, avg 21µs/call
sub NormalizeString{
17413191.10ms my ($string,$nfd,$transform)=@_;
17513198.82ms26382.95ms utf8::decode($string) unless (utf8::is_utf8($string));
# spent 1.70ms making 1319 calls to utf8::decode, avg 1µs/call # spent 1.25ms making 1319 calls to utf8::is_utf8, avg 945ns/call
17613191.17ms if ($nfd){
177 $string= NFD($string);
178 }
179 else {
180131912.0ms13198.06ms $string=NFC($string);
# spent 8.06ms making 1319 calls to Unicode::Normalize::NFC, avg 6µs/call
181 }
1821319562µs if ($transform){
183 $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
184 #removing one letter words "d'" "l'" was changed into "d " "l "
185 $string=~s/\b\S\b//g;
186 $string=~s/\s+$//g;
187 }
18813194.12ms return $string;
189}
190
191=head2 MarcToUTF8Record
192
- -
212sub MarcToUTF8Record {
213 my $marc = shift;
214 my $marc_flavour = shift;
215 my $source_encoding = shift;
216 my $marc_record;
217 my $marc_blob_is_utf8 = 0;
218 if (ref($marc) eq 'MARC::Record') {
219 my $marc_blob = $marc->as_usmarc();
220 $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
221 $marc_record = $marc;
222 } else {
223 # dealing with a MARC blob
224
225 # remove any ersatz whitespace from the beginning and
226 # end of the MARC blob -- these can creep into MARC
227 # files produced by several sources -- caller really
228 # should be doing this, however
229 $marc =~ s/^\s+//;
230 $marc =~ s/\s+$//;
231 $marc_blob_is_utf8 = IsStringUTF8ish($marc);
232 eval {
233 $marc_record = MARC::Record->new_from_usmarc($marc);
234 };
235 if ($@) {
236 # if we fail the first time, one likely problem
237 # is that we have a MARC21 record that says that it's
238 # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
239 # We'll try parsing it again.
240 substr($marc, 9, 1) = ' ';
241 eval {
242 $marc_record = MARC::Record->new_from_usmarc($marc);
243 };
244 if ($@) {
245 # it's hopeless; return an empty MARC::Record
246 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
247 }
248 }
249 }
250
251 # If we do not know the source encoding, try some guesses
252 # as follows:
253 # 1. Record is UTF-8 already.
254 # 2. If MARC flavor is MARC21 or NORMARC, then
255 # a. record is MARC-8
256 # b. record is ISO-8859-1
257 # 3. If MARC flavor is UNIMARC, then
258 if (not defined $source_encoding) {
259 if ($marc_blob_is_utf8) {
260 # note that for MARC21/NORMARC we are not bothering to check
261 # if the Leader/09 is set to 'a' or not -- because
262 # of problems with various ILSs (including Koha in the
263 # past, alas), this just is not trustworthy.
264 SetMarcUnicodeFlag($marc_record, $marc_flavour);
265 return $marc_record, 'UTF-8', [];
266 } else {
267 if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
268 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
269 } elsif ($marc_flavour =~/UNIMARC/) {
270 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
271 } else {
272 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
273 }
274 }
275 } else {
276 # caller knows the character encoding
277 my $original_marc_record = $marc_record->clone();
278 my @errors;
279 if ($source_encoding =~ /utf-?8/i) {
280 if ($marc_blob_is_utf8) {
281 SetMarcUnicodeFlag($marc_record, $marc_flavour);
282 return $marc_record, 'UTF-8', [];
283 } else {
284 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
285 }
286 } elsif ($source_encoding =~ /marc-?8/i) {
287 @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
288 } elsif ($source_encoding =~ /5426/) {
289 @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
290 } else {
291 # assume any other character encoding is for Text::Iconv
292 @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
293 }
294
295 if (@errors) {
296 _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
297 return $original_marc_record, 'failed', \@errors;
298 } else {
299 return $marc_record, $source_encoding, [];
300 }
301 }
302
303}
304
305=head2 SetMarcUnicodeFlag
306
- -
317sub SetMarcUnicodeFlag {
318 my $marc_record = shift;
319 my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
320
321 $marc_record->encoding('UTF-8');
322 if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
323 my $leader = $marc_record->leader();
324 substr($leader, 9, 1) = 'a';
325 $marc_record->leader($leader);
326 } elsif ($marc_flavour =~/UNIMARC/) {
327 my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
328 $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
329 my $string;
330 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
331 $string=$marc_record->subfield( 100, "a" );
332 if (defined $string && length($string)==$subflength) {
333 $string = substr $string, 0,$subflength if (length($string)>$subflength);
334 }
335 else {
336 $string = POSIX::strftime( "%Y%m%d", localtime );
337 $string =~ s/\-//g;
338 $string = sprintf( "%-*s", $subflength, $string );
339 substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
340 }
341 substr( $string, $encodingposition, 3, "y50" );
342 if ( $marc_record->subfield( 100, "a" ) ) {
343 $marc_record->field('100')->update(a=>$string);
344 }
345 else {
346 $marc_record->insert_grouped_field(
347 MARC::Field->new( 100, '', '', "a" => $string ) );
348 }
349 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
350 } else {
351 warn "Unrecognized marcflavour: $marc_flavour";
352 }
353}
354
355=head2 StripNonXmlChars
356
- -
380
# spent 10.0ms (582µs+9.47) within C4::Charset::StripNonXmlChars which was called 25 times, avg 402µs/call: # 25 times (582µs+9.47ms) by C4::Biblio::GetMarcBiblio at line 1258 of /usr/share/koha/lib/C4/Biblio.pm, avg 402µs/call
sub StripNonXmlChars {
3812561µs my $str = shift;
3822550µs if (!defined($str) || $str eq ""){
383 return "";
384 }
385259.43ms269.71ms $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
# spent 9.47ms making 25 calls to C4::Charset::CORE:subst, avg 379µs/call # spent 246µs making 1 call to utf8::SWASHNEW
38625211µs return $str;
387}
388
- -
391=head2 nsb_clean
392
- -
402sub nsb_clean {
403 my $NSB = '\x88' ; # NSB : begin Non Sorting Block
404 my $NSE = '\x89' ; # NSE : Non Sorting Block end
405 my $NSB2 = '\x98' ; # NSB : begin Non Sorting Block
406 my $NSE2 = '\x9C' ; # NSE : Non Sorting Block end
407 my $C2 = '\xC2' ; # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
408
409 # handles non sorting blocks
410 my ($string) = @_ ;
411 $_ = $string ;
412 s/$NSB//g ;
413 s/$NSE//g ;
414 s/$NSB2//g ;
415 s/$NSE2//g ;
416 s/$C2//g ;
417 $string = $_ ;
418
419 return($string) ;
420}
421
422
423=head1 INTERNAL FUNCTIONS
424
- -
441sub _default_marc21_charconv_to_utf8 {
442 my $marc_record = shift;
443 my $marc_flavour = shift;
444
445 my $trial_marc8 = $marc_record->clone();
446 my @all_errors = ();
447 my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
448 unless (@errors) {
449 return $trial_marc8, 'MARC-8', [];
450 }
451 push @all_errors, @errors;
452
453 my $trial_8859_1 = $marc_record->clone();
454 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
455 unless (@errors) {
456 return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
457 # instead if we wanted to report details
458 # of the failed attempt at MARC-8 => UTF-8
459 }
460 push @all_errors, @errors;
461
462 my $default_converted = $marc_record->clone();
463 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
464 return $default_converted, 'failed', \@all_errors;
465}
466
467=head2 _default_unimarc_charconv_to_utf8
468
- -
483sub _default_unimarc_charconv_to_utf8 {
484 my $marc_record = shift;
485 my $marc_flavour = shift;
486
487 my $trial_marc8 = $marc_record->clone();
488 my @all_errors = ();
489 my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
490 unless (@errors) {
491 return $trial_marc8, 'iso-5426';
492 }
493 push @all_errors, @errors;
494
495 my $trial_8859_1 = $marc_record->clone();
496 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
497 unless (@errors) {
498 return $trial_8859_1, 'iso-8859-1';
499 }
500 push @all_errors, @errors;
501
502 my $default_converted = $marc_record->clone();
503 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
504 return $default_converted, 'failed', \@all_errors;
505}
506
507=head2 _marc_marc8_to_utf8
508
- -
518sub _marc_marc8_to_utf8 {
519 my $marc_record = shift;
520 my $marc_flavour = shift;
521
522 my $prev_ignore = MARC::Charset->ignore_errors();
523 MARC::Charset->ignore_errors(1);
524
525 # trap warnings raised by MARC::Charset
526 my @errors = ();
527 local $SIG{__WARN__} = sub {
528 my $msg = $_[0];
529 if ($msg =~ /MARC.Charset/) {
530 # FIXME - purpose of this regexp is to strip out the
531 # line reference to MARC/Charset.pm, but as it
532 # exists probably won't work quite on Windows --
533 # some sort of minimal-bunch back-tracking RE
534 # would be helpful here
535 $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
536 push @errors, $msg;
537 } else {
538 # if warning doesn't come from MARC::Charset, just
539 # pass it on
540 warn $msg;
541 }
542 };
543
544 foreach my $field ($marc_record->fields()) {
545 if ($field->is_control_field()) {
546 ; # do nothing -- control fields should not contain non-ASCII characters
547 } else {
548 my @converted_subfields;
549 foreach my $subfield ($field->subfields()) {
550 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
551 unless (IsStringUTF8ish($utf8sf)) {
552 # Because of a bug in MARC::Charset 0.98, if the string
553 # has (a) one or more diacritics that (b) are only in character positions
554 # 128 to 255 inclusive, the resulting converted string is not in
555 # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
556 # occurs, upgrade the string in place. Moral of the story seems to be
557 # that pack("U", ...) is better than chr(...) if you need to guarantee
558 # that the resulting string is UTF-8.
559 utf8::upgrade($utf8sf);
560 }
561 push @converted_subfields, $subfield->[0], $utf8sf;
562 }
563
564 $field->replace_with(MARC::Field->new(
565 $field->tag(), $field->indicator(1), $field->indicator(2),
566 @converted_subfields)
567 );
568 }
569 }
570
571 MARC::Charset->ignore_errors($prev_ignore);
572
573 SetMarcUnicodeFlag($marc_record, $marc_flavour);
574
575 return @errors;
576}
577
578=head2 _marc_iso5426_to_utf8
579
- -
592sub _marc_iso5426_to_utf8 {
593 my $marc_record = shift;
594 my $marc_flavour = shift;
595
596 my @errors = ();
597
598 foreach my $field ($marc_record->fields()) {
599 if ($field->is_control_field()) {
600 ; # do nothing -- control fields should not contain non-ASCII characters
601 } else {
602 my @converted_subfields;
603 foreach my $subfield ($field->subfields()) {
604 my $utf8sf = char_decode5426($subfield->[1]);
605 push @converted_subfields, $subfield->[0], $utf8sf;
606 }
607
608 $field->replace_with(MARC::Field->new(
609 $field->tag(), $field->indicator(1), $field->indicator(2),
610 @converted_subfields)
611 );
612 }
613 }
614
615 SetMarcUnicodeFlag($marc_record, $marc_flavour);
616
617 return @errors;
618}
619
620=head2 _marc_to_utf8_via_text_iconv
621
- -
634sub _marc_to_utf8_via_text_iconv {
635 my $marc_record = shift;
636 my $marc_flavour = shift;
637 my $source_encoding = shift;
638
639 my @errors = ();
640 my $decoder;
641 eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
642 if ($@) {
643 push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
644 return @errors;
645 }
646
647 my $prev_raise_error = Text::Iconv->raise_error();
648 Text::Iconv->raise_error(1);
649
650 foreach my $field ($marc_record->fields()) {
651 if ($field->is_control_field()) {
652 ; # do nothing -- control fields should not contain non-ASCII characters
653 } else {
654 my @converted_subfields;
655 foreach my $subfield ($field->subfields()) {
656 my $converted_value;
657 my $conversion_ok = 1;
658 eval { $converted_value = $decoder->convert($subfield->[1]); };
659 if ($@) {
660 $conversion_ok = 0;
661 push @errors, $@;
662 } elsif (not defined $converted_value) {
663 $conversion_ok = 0;
664 push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
665 }
666
667 if ($conversion_ok) {
668 push @converted_subfields, $subfield->[0], $converted_value;
669 } else {
670 $converted_value = $subfield->[1];
671 $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
672 push @converted_subfields, $subfield->[0], $converted_value;
673 }
674 }
675
676 $field->replace_with(MARC::Field->new(
677 $field->tag(), $field->indicator(1), $field->indicator(2),
678 @converted_subfields)
679 );
680 }
681 }
682
683 SetMarcUnicodeFlag($marc_record, $marc_flavour);
684 Text::Iconv->raise_error($prev_raise_error);
685
686 return @errors;
687}
688
689=head2 _marc_to_utf8_replacement_char
690
- -
703sub _marc_to_utf8_replacement_char {
704 my $marc_record = shift;
705 my $marc_flavour = shift;
706
707 foreach my $field ($marc_record->fields()) {
708 if ($field->is_control_field()) {
709 ; # do nothing -- control fields should not contain non-ASCII characters
710 } else {
711 my @converted_subfields;
712 foreach my $subfield ($field->subfields()) {
713 my $value = $subfield->[1];
714 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
715 push @converted_subfields, $subfield->[0], $value;
716 }
717
718 $field->replace_with(MARC::Field->new(
719 $field->tag(), $field->indicator(1), $field->indicator(2),
720 @converted_subfields)
721 );
722 }
723 }
724
725 SetMarcUnicodeFlag($marc_record, $marc_flavour);
726}
727
728=head2 char_decode5426
729
- -
7371900nsmy %chars;
73818µs$chars{0xb0}=0x0101;#3/0ayn[ain]
73913µs$chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
740#$chars{0xb2}=0x00e0;#'à';
74112µs$chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
742#$chars{0xb3}=0x00e7;#'ç';
74312µs$chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
744# $chars{0xb4}='è';
74511µs$chars{0xb4}=0x00e8;
74611µs$chars{0xbd}=0x02b9;
74712µs$chars{0xbe}=0x02ba;
748# $chars{0xb5}='é';
74912µs$chars{0xb5}=0x00e9;
75013µs$chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
75111µs$chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
75211µs$chars{0xfa}=0x0153; #oe
75313µs$chars{0xea}=0x0152; #oe
75412µs$chars{0x81d1}=0x00b0;
755
756####
757## combined characters iso5426
758
75911µs$chars{0xc041}=0x1ea2; # capital a with hook above
76011µs$chars{0xc045}=0x1eba; # capital e with hook above
76111µs$chars{0xc049}=0x1ec8; # capital i with hook above
76212µs$chars{0xc04f}=0x1ece; # capital o with hook above
76312µs$chars{0xc055}=0x1ee6; # capital u with hook above
76412µs$chars{0xc059}=0x1ef6; # capital y with hook above
76511µs$chars{0xc061}=0x1ea3; # small a with hook above
76612µs$chars{0xc065}=0x1ebb; # small e with hook above
76711µs$chars{0xc069}=0x1ec9; # small i with hook above
76812µs$chars{0xc06f}=0x1ecf; # small o with hook above
76912µs$chars{0xc075}=0x1ee7; # small u with hook above
77012µs$chars{0xc079}=0x1ef7; # small y with hook above
771
772 # 4/1 grave accent
77312µs$chars{0xc141}=0x00c0; # capital a with grave accent
77419µs$chars{0xc145}=0x00c8; # capital e with grave accent
77513µs$chars{0xc149}=0x00cc; # capital i with grave accent
77611µs$chars{0xc14f}=0x00d2; # capital o with grave accent
77712µs$chars{0xc155}=0x00d9; # capital u with grave accent
77812µs$chars{0xc157}=0x1e80; # capital w with grave
77913µs$chars{0xc159}=0x1ef2; # capital y with grave
78012µs$chars{0xc161}=0x00e0; # small a with grave accent
78112µs$chars{0xc165}=0x00e8; # small e with grave accent
78212µs$chars{0xc169}=0x00ec; # small i with grave accent
78312µs$chars{0xc16f}=0x00f2; # small o with grave accent
78412µs$chars{0xc175}=0x00f9; # small u with grave accent
78512µs$chars{0xc177}=0x1e81; # small w with grave
78612µs$chars{0xc179}=0x1ef3; # small y with grave
787 # 4/2 acute accent
78812µs$chars{0xc241}=0x00c1; # capital a with acute accent
78912µs$chars{0xc243}=0x0106; # capital c with acute accent
79011µs$chars{0xc245}=0x00c9; # capital e with acute accent
79112µs$chars{0xc247}=0x01f4; # capital g with acute
79212µs$chars{0xc249}=0x00cd; # capital i with acute accent
79312µs$chars{0xc24b}=0x1e30; # capital k with acute
79413µs$chars{0xc24c}=0x0139; # capital l with acute accent
79511µs$chars{0xc24d}=0x1e3e; # capital m with acute
79613µs$chars{0xc24e}=0x0143; # capital n with acute accent
79711µs$chars{0xc24f}=0x00d3; # capital o with acute accent
79812µs$chars{0xc250}=0x1e54; # capital p with acute
79912µs$chars{0xc252}=0x0154; # capital r with acute accent
80011µs$chars{0xc253}=0x015a; # capital s with acute accent
80111µs$chars{0xc255}=0x00da; # capital u with acute accent
80211µs$chars{0xc257}=0x1e82; # capital w with acute
80311µs$chars{0xc259}=0x00dd; # capital y with acute accent
80411µs$chars{0xc25a}=0x0179; # capital z with acute accent
80512µs$chars{0xc261}=0x00e1; # small a with acute accent
80611µs$chars{0xc263}=0x0107; # small c with acute accent
80711µs$chars{0xc265}=0x00e9; # small e with acute accent
80811µs$chars{0xc267}=0x01f5; # small g with acute
80912µs$chars{0xc269}=0x00ed; # small i with acute accent
81012µs$chars{0xc26b}=0x1e31; # small k with acute
81112µs$chars{0xc26c}=0x013a; # small l with acute accent
81219µs$chars{0xc26d}=0x1e3f; # small m with acute
81312µs$chars{0xc26e}=0x0144; # small n with acute accent
81412µs$chars{0xc26f}=0x00f3; # small o with acute accent
81511µs$chars{0xc270}=0x1e55; # small p with acute
81611µs$chars{0xc272}=0x0155; # small r with acute accent
81711µs$chars{0xc273}=0x015b; # small s with acute accent
81812µs$chars{0xc275}=0x00fa; # small u with acute accent
81911µs$chars{0xc277}=0x1e83; # small w with acute
82011µs$chars{0xc279}=0x00fd; # small y with acute accent
82111µs$chars{0xc27a}=0x017a; # small z with acute accent
82212µs$chars{0xc2e1}=0x01fc; # capital ae with acute
82311µs$chars{0xc2f1}=0x01fd; # small ae with acute
824 # 4/3 circumflex accent
82512µs$chars{0xc341}=0x00c2; # capital a with circumflex accent
82611µs$chars{0xc343}=0x0108; # capital c with circumflex
82712µs$chars{0xc345}=0x00ca; # capital e with circumflex accent
82812µs$chars{0xc347}=0x011c; # capital g with circumflex
82911µs$chars{0xc348}=0x0124; # capital h with circumflex
83011µs$chars{0xc349}=0x00ce; # capital i with circumflex accent
83112µs$chars{0xc34a}=0x0134; # capital j with circumflex
83211µs$chars{0xc34f}=0x00d4; # capital o with circumflex accent
83312µs$chars{0xc353}=0x015c; # capital s with circumflex
83411µs$chars{0xc355}=0x00db; # capital u with circumflex
83512µs$chars{0xc357}=0x0174; # capital w with circumflex
83612µs$chars{0xc359}=0x0176; # capital y with circumflex
83711µs$chars{0xc35a}=0x1e90; # capital z with circumflex
83811µs$chars{0xc361}=0x00e2; # small a with circumflex accent
83912µs$chars{0xc363}=0x0109; # small c with circumflex
84012µs$chars{0xc365}=0x00ea; # small e with circumflex accent
84111µs$chars{0xc367}=0x011d; # small g with circumflex
84212µs$chars{0xc368}=0x0125; # small h with circumflex
84312µs$chars{0xc369}=0x00ee; # small i with circumflex accent
84412µs$chars{0xc36a}=0x0135; # small j with circumflex
84512µs$chars{0xc36e}=0x00f1; # small n with tilde
84612µs$chars{0xc36f}=0x00f4; # small o with circumflex accent
84712µs$chars{0xc373}=0x015d; # small s with circumflex
84812µs$chars{0xc375}=0x00fb; # small u with circumflex
84912µs$chars{0xc377}=0x0175; # small w with circumflex
85012µs$chars{0xc379}=0x0177; # small y with circumflex
85111µs$chars{0xc37a}=0x1e91; # small z with circumflex
852 # 4/4 tilde
85312µs$chars{0xc441}=0x00c3; # capital a with tilde
85411µs$chars{0xc445}=0x1ebc; # capital e with tilde
85511µs$chars{0xc449}=0x0128; # capital i with tilde
85611µs$chars{0xc44e}=0x00d1; # capital n with tilde
85711µs$chars{0xc44f}=0x00d5; # capital o with tilde
85812µs$chars{0xc455}=0x0168; # capital u with tilde
85911µs$chars{0xc456}=0x1e7c; # capital v with tilde
86012µs$chars{0xc459}=0x1ef8; # capital y with tilde
86112µs$chars{0xc461}=0x00e3; # small a with tilde
86212µs$chars{0xc465}=0x1ebd; # small e with tilde
86318µs$chars{0xc469}=0x0129; # small i with tilde
86412µs$chars{0xc46e}=0x00f1; # small n with tilde
86511µs$chars{0xc46f}=0x00f5; # small o with tilde
86611µs$chars{0xc475}=0x0169; # small u with tilde
86712µs$chars{0xc476}=0x1e7d; # small v with tilde
86811µs$chars{0xc479}=0x1ef9; # small y with tilde
869 # 4/5 macron
87012µs$chars{0xc541}=0x0100; # capital a with macron
87112µs$chars{0xc545}=0x0112; # capital e with macron
87212µs$chars{0xc547}=0x1e20; # capital g with macron
87311µs$chars{0xc549}=0x012a; # capital i with macron
87415µs$chars{0xc54f}=0x014c; # capital o with macron
87512µs$chars{0xc555}=0x016a; # capital u with macron
87612µs$chars{0xc561}=0x0101; # small a with macron
87712µs$chars{0xc565}=0x0113; # small e with macron
87812µs$chars{0xc567}=0x1e21; # small g with macron
87912µs$chars{0xc569}=0x012b; # small i with macron
88016µs$chars{0xc56f}=0x014d; # small o with macron
88112µs$chars{0xc575}=0x016b; # small u with macron
88212µs$chars{0xc572}=0x0159; # small r with macron
88312µs$chars{0xc5e1}=0x01e2; # capital ae with macron
88411µs$chars{0xc5f1}=0x01e3; # small ae with macron
885 # 4/6 breve
88612µs$chars{0xc641}=0x0102; # capital a with breve
88712µs$chars{0xc645}=0x0114; # capital e with breve
88812µs$chars{0xc647}=0x011e; # capital g with breve
88912µs$chars{0xc649}=0x012c; # capital i with breve
89011µs$chars{0xc64f}=0x014e; # capital o with breve
89112µs$chars{0xc655}=0x016c; # capital u with breve
89211µs$chars{0xc661}=0x0103; # small a with breve
89312µs$chars{0xc665}=0x0115; # small e with breve
89411µs$chars{0xc667}=0x011f; # small g with breve
89512µs$chars{0xc669}=0x012d; # small i with breve
89613µs$chars{0xc66f}=0x014f; # small o with breve
89712µs$chars{0xc675}=0x016d; # small u with breve
898 # 4/7 dot above
89912µs$chars{0xc7b0}=0x01e1; # Ain with dot above
90012µs$chars{0xc742}=0x1e02; # capital b with dot above
90112µs$chars{0xc743}=0x010a; # capital c with dot above
90212µs$chars{0xc744}=0x1e0a; # capital d with dot above
90311µs$chars{0xc745}=0x0116; # capital e with dot above
90411µs$chars{0xc746}=0x1e1e; # capital f with dot above
90511µs$chars{0xc747}=0x0120; # capital g with dot above
90612µs$chars{0xc748}=0x1e22; # capital h with dot above
90712µs$chars{0xc749}=0x0130; # capital i with dot above
90811µs$chars{0xc74d}=0x1e40; # capital m with dot above
90911µs$chars{0xc74e}=0x1e44; # capital n with dot above
91011µs$chars{0xc750}=0x1e56; # capital p with dot above
91112µs$chars{0xc752}=0x1e58; # capital r with dot above
91212µs$chars{0xc753}=0x1e60; # capital s with dot above
91311µs$chars{0xc754}=0x1e6a; # capital t with dot above
91412µs$chars{0xc757}=0x1e86; # capital w with dot above
91516µs$chars{0xc758}=0x1e8a; # capital x with dot above
91612µs$chars{0xc759}=0x1e8e; # capital y with dot above
91711µs$chars{0xc75a}=0x017b; # capital z with dot above
91812µs$chars{0xc761}=0x0227; # small b with dot above
91911µs$chars{0xc762}=0x1e03; # small b with dot above
92011µs$chars{0xc763}=0x010b; # small c with dot above
92111µs$chars{0xc764}=0x1e0b; # small d with dot above
92216µs$chars{0xc765}=0x0117; # small e with dot above
92311µs$chars{0xc766}=0x1e1f; # small f with dot above
92411µs$chars{0xc767}=0x0121; # small g with dot above
92511µs$chars{0xc768}=0x1e23; # small h with dot above
92612µs$chars{0xc76d}=0x1e41; # small m with dot above
92711µs$chars{0xc76e}=0x1e45; # small n with dot above
92811µs$chars{0xc770}=0x1e57; # small p with dot above
92912µs$chars{0xc772}=0x1e59; # small r with dot above
93012µs$chars{0xc773}=0x1e61; # small s with dot above
93111µs$chars{0xc774}=0x1e6b; # small t with dot above
93211µs$chars{0xc777}=0x1e87; # small w with dot above
93311µs$chars{0xc778}=0x1e8b; # small x with dot above
93412µs$chars{0xc779}=0x1e8f; # small y with dot above
93511µs$chars{0xc77a}=0x017c; # small z with dot above
936 # 4/8 trema, diaresis
93712µs$chars{0xc820}=0x00a8; # diaeresis
93812µs$chars{0xc841}=0x00c4; # capital a with diaeresis
93912µs$chars{0xc845}=0x00cb; # capital e with diaeresis
94012µs$chars{0xc848}=0x1e26; # capital h with diaeresis
94112µs$chars{0xc849}=0x00cf; # capital i with diaeresis
94213µs$chars{0xc84f}=0x00d6; # capital o with diaeresis
94312µs$chars{0xc855}=0x00dc; # capital u with diaeresis
94412µs$chars{0xc857}=0x1e84; # capital w with diaeresis
94511µs$chars{0xc858}=0x1e8c; # capital x with diaeresis
94612µs$chars{0xc859}=0x0178; # capital y with diaeresis
94712µs$chars{0xc861}=0x00e4; # small a with diaeresis
94811µs$chars{0xc865}=0x00eb; # small e with diaeresis
94911µs$chars{0xc868}=0x1e27; # small h with diaeresis
95012µs$chars{0xc869}=0x00ef; # small i with diaeresis
95118µs$chars{0xc86f}=0x00f6; # small o with diaeresis
95212µs$chars{0xc874}=0x1e97; # small t with diaeresis
95311µs$chars{0xc875}=0x00fc; # small u with diaeresis
95412µs$chars{0xc877}=0x1e85; # small w with diaeresis
95511µs$chars{0xc878}=0x1e8d; # small x with diaeresis
95611µs$chars{0xc879}=0x00ff; # small y with diaeresis
957 # 4/9 umlaut
95811µs$chars{0xc920}=0x00a8; # [diaeresis]
95911µs$chars{0xc961}=0x00e4; # a with umlaut
96011µs$chars{0xc965}=0x00eb; # e with umlaut
96111µs$chars{0xc969}=0x00ef; # i with umlaut
96211µs$chars{0xc96f}=0x00f6; # o with umlaut
96311µs$chars{0xc975}=0x00fc; # u with umlaut
964 # 4/10 circle above
96511µs$chars{0xca41}=0x00c5; # capital a with ring above
96612µs$chars{0xcaad}=0x016e; # capital u with ring above
96711µs$chars{0xca61}=0x00e5; # small a with ring above
96811µs$chars{0xca75}=0x016f; # small u with ring above
96916µs$chars{0xca77}=0x1e98; # small w with ring above
97012µs$chars{0xca79}=0x1e99; # small y with ring above
971 # 4/11 high comma off centre
972 # 4/12 inverted high comma centred
973 # 4/13 double acute accent
97412µs$chars{0xcd4f}=0x0150; # capital o with double acute
97511µs$chars{0xcd55}=0x0170; # capital u with double acute
97612µs$chars{0xcd6f}=0x0151; # small o with double acute
97712µs$chars{0xcd75}=0x0171; # small u with double acute
978 # 4/14 horn
97912µs$chars{0xce54}=0x01a0; # latin capital letter o with horn
98012µs$chars{0xce55}=0x01af; # latin capital letter u with horn
98111µs$chars{0xce74}=0x01a1; # latin small letter o with horn
9821900ns$chars{0xce75}=0x01b0; # latin small letter u with horn
983 # 4/15 caron (hacek
9841900ns$chars{0xcf41}=0x01cd; # capital a with caron
9851900ns$chars{0xcf43}=0x010c; # capital c with caron
98611µs$chars{0xcf44}=0x010e; # capital d with caron
98711µs$chars{0xcf45}=0x011a; # capital e with caron
98812µs$chars{0xcf47}=0x01e6; # capital g with caron
9891900ns$chars{0xcf49}=0x01cf; # capital i with caron
99012µs$chars{0xcf4b}=0x01e8; # capital k with caron
99111µs$chars{0xcf4c}=0x013d; # capital l with caron
99212µs$chars{0xcf4e}=0x0147; # capital n with caron
99311µs$chars{0xcf4f}=0x01d1; # capital o with caron
99412µs$chars{0xcf52}=0x0158; # capital r with caron
99511µs$chars{0xcf53}=0x0160; # capital s with caron
99612µs$chars{0xcf54}=0x0164; # capital t with caron
99711µs$chars{0xcf55}=0x01d3; # capital u with caron
99811µs$chars{0xcf5a}=0x017d; # capital z with caron
99911µs$chars{0xcf61}=0x01ce; # small a with caron
100011µs$chars{0xcf63}=0x010d; # small c with caron
100111µs$chars{0xcf64}=0x010f; # small d with caron
100211µs$chars{0xcf65}=0x011b; # small e with caron
100312µs$chars{0xcf67}=0x01e7; # small g with caron
100411µs$chars{0xcf69}=0x01d0; # small i with caron
100512µs$chars{0xcf6a}=0x01f0; # small j with caron
100612µs$chars{0xcf6b}=0x01e9; # small k with caron
100711µs$chars{0xcf6c}=0x013e; # small l with caron
100812µs$chars{0xcf6e}=0x0148; # small n with caron
100912µs$chars{0xcf6f}=0x01d2; # small o with caron
101011µs$chars{0xcf72}=0x0159; # small r with caron
101112µs$chars{0xcf73}=0x0161; # small s with caron
101211µs$chars{0xcf74}=0x0165; # small t with caron
101312µs$chars{0xcf75}=0x01d4; # small u with caron
101411µs$chars{0xcf7a}=0x017e; # small z with caron
1015 # 5/0 cedilla
10161900ns$chars{0xd020}=0x00b8; # cedilla
101711µs$chars{0xd043}=0x00c7; # capital c with cedilla
101811µs$chars{0xd044}=0x1e10; # capital d with cedilla
1019114µs$chars{0xd047}=0x0122; # capital g with cedilla
102011µs$chars{0xd048}=0x1e28; # capital h with cedilla
102112µs$chars{0xd04b}=0x0136; # capital k with cedilla
102212µs$chars{0xd04c}=0x013b; # capital l with cedilla
10231900ns$chars{0xd04e}=0x0145; # capital n with cedilla
102412µs$chars{0xd052}=0x0156; # capital r with cedilla
10251800ns$chars{0xd053}=0x015e; # capital s with cedilla
102614µs$chars{0xd054}=0x0162; # capital t with cedilla
102712µs$chars{0xd063}=0x00e7; # small c with cedilla
102812µs$chars{0xd064}=0x1e11; # small d with cedilla
102911µs$chars{0xd065}=0x0119; # small e with cedilla
103011µs$chars{0xd067}=0x0123; # small g with cedilla
10311900ns$chars{0xd068}=0x1e29; # small h with cedilla
103211µs$chars{0xd06b}=0x0137; # small k with cedilla
103311µs$chars{0xd06c}=0x013c; # small l with cedilla
103411µs$chars{0xd06e}=0x0146; # small n with cedilla
103511µs$chars{0xd072}=0x0157; # small r with cedilla
103616µs$chars{0xd073}=0x015f; # small s with cedilla
103712µs$chars{0xd074}=0x0163; # small t with cedilla
1038 # 5/1 rude
1039 # 5/2 hook to left
1040 # 5/3 ogonek (hook to right
104114µs$chars{0xd320}=0x02db; # ogonek
104212µs$chars{0xd341}=0x0104; # capital a with ogonek
10431900ns$chars{0xd345}=0x0118; # capital e with ogonek
104412µs$chars{0xd349}=0x012e; # capital i with ogonek
104511µs$chars{0xd34f}=0x01ea; # capital o with ogonek
104611µs$chars{0xd355}=0x0172; # capital u with ogonek
104712µs$chars{0xd361}=0x0105; # small a with ogonek
104811µs$chars{0xd365}=0x0119; # small e with ogonek
104912µs$chars{0xd369}=0x012f; # small i with ogonek
105011µs$chars{0xd36f}=0x01eb; # small o with ogonek
105111µs$chars{0xd375}=0x0173; # small u with ogonek
1052 # 5/4 circle below
105313µs$chars{0xd441}=0x1e00; # capital a with ring below
105412µs$chars{0xd461}=0x1e01; # small a with ring below
1055 # 5/5 half circle below
105611µs$chars{0xf948}=0x1e2a; # capital h with breve below
105711µs$chars{0xf968}=0x1e2b; # small h with breve below
1058 # 5/6 dot below
105912µs$chars{0xd641}=0x1ea0; # capital a with dot below
106011µs$chars{0xd642}=0x1e04; # capital b with dot below
106111µs$chars{0xd644}=0x1e0c; # capital d with dot below
106211µs$chars{0xd645}=0x1eb8; # capital e with dot below
106311µs$chars{0xd648}=0x1e24; # capital h with dot below
106411µs$chars{0xd649}=0x1eca; # capital i with dot below
106511µs$chars{0xd64b}=0x1e32; # capital k with dot below
106612µs$chars{0xd64c}=0x1e36; # capital l with dot below
106712µs$chars{0xd64d}=0x1e42; # capital m with dot below
106812µs$chars{0xd64e}=0x1e46; # capital n with dot below
106911µs$chars{0xd64f}=0x1ecc; # capital o with dot below
107011µs$chars{0xd652}=0x1e5a; # capital r with dot below
107112µs$chars{0xd653}=0x1e62; # capital s with dot below
107213µs$chars{0xd654}=0x1e6c; # capital t with dot below
107312µs$chars{0xd655}=0x1ee4; # capital u with dot below
107412µs$chars{0xd656}=0x1e7e; # capital v with dot below
107512µs$chars{0xd657}=0x1e88; # capital w with dot below
107611µs$chars{0xd659}=0x1ef4; # capital y with dot below
107711µs$chars{0xd65a}=0x1e92; # capital z with dot below
107812µs$chars{0xd661}=0x1ea1; # small a with dot below
107912µs$chars{0xd662}=0x1e05; # small b with dot below
108012µs$chars{0xd664}=0x1e0d; # small d with dot below
108112µs$chars{0xd665}=0x1eb9; # small e with dot below
108211µs$chars{0xd668}=0x1e25; # small h with dot below
108312µs$chars{0xd669}=0x1ecb; # small i with dot below
108412µs$chars{0xd66b}=0x1e33; # small k with dot below
108512µs$chars{0xd66c}=0x1e37; # small l with dot below
108617µs$chars{0xd66d}=0x1e43; # small m with dot below
108712µs$chars{0xd66e}=0x1e47; # small n with dot below
108812µs$chars{0xd66f}=0x1ecd; # small o with dot below
108912µs$chars{0xd672}=0x1e5b; # small r with dot below
109011µs$chars{0xd673}=0x1e63; # small s with dot below
109112µs$chars{0xd674}=0x1e6d; # small t with dot below
109212µs$chars{0xd675}=0x1ee5; # small u with dot below
109312µs$chars{0xd676}=0x1e7f; # small v with dot below
109412µs$chars{0xd677}=0x1e89; # small w with dot below
109512µs$chars{0xd679}=0x1ef5; # small y with dot below
109611µs$chars{0xd67a}=0x1e93; # small z with dot below
1097 # 5/7 double dot below
109812µs$chars{0xd755}=0x1e72; # capital u with diaeresis below
109912µs$chars{0xd775}=0x1e73; # small u with diaeresis below
1100 # 5/8 underline
110112µs$chars{0xd820}=0x005f; # underline
1102 # 5/9 double underline
110312µs$chars{0xd920}=0x2017; # double underline
1104 # 5/10 small low vertical bar
110511µs$chars{0xda20}=0x02cc; #
1106 # 5/11 circumflex below
1107 # 5/12 (this position shall not be used)
1108 # 5/13 left half of ligature sign and of double tilde
1109 # 5/14 right half of ligature sign
1110 # 5/15 right half of double tilde
1111# map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1112
1113sub char_decode5426 {
1114 my ( $string) = @_;
1115 my $result;
1116
1117 my @data = unpack("C*", $string);
1118 my @characters;
1119 my $length=scalar(@data);
1120 for (my $i = 0; $i < scalar(@data); $i++) {
1121 my $char= $data[$i];
1122 if ($char >= 0x00 && $char <= 0x7F){
1123 #IsAscii
1124
1125 push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1126 }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1127 #Combined Char
1128 my $convchar ;
1129 if ($chars{$char*256+$data[$i+1]}) {
1130 $convchar= $chars{$char * 256 + $data[$i+1]};
1131 $i++;
1132# printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1133 } elsif ($chars{$char}) {
1134 $convchar= $chars{$char};
1135# printf "0xC char %x, converted %x\n",$char,$chars{$char};
1136 }else {
1137 $convchar=$char;
1138 }
1139 push @characters,$convchar;
1140 } else {
1141 my $convchar;
1142 if ($chars{$char}) {
1143 $convchar= $chars{$char};
1144# printf "char %x, converted %x\n",$char,$chars{$char};
1145 }else {
1146# printf "char %x $char\n",$char;
1147 $convchar=$char;
1148 }
1149 push @characters,$convchar;
1150 }
1151 }
1152 $result=pack "U*",@characters;
1153# $result=~s/\x01//;
1154# $result=~s/\x00//;
1155 $result=~s/\x0f//;
1156 $result=~s/\x1b.//;
1157 $result=~s/\x0e//;
1158 $result=~s/\x1b\x5b//;
1159# map{printf "%x",$_} @characters;
1160# printf "\n";
1161 return $result;
1162}
1163
11641196µs1;
1165
1166
1167=head1 AUTHOR
1168
- -
 
# spent 9.47ms (9.22+246µs) within C4::Charset::CORE:subst which was called 25 times, avg 379µs/call: # 25 times (9.22ms+246µs) by C4::Charset::StripNonXmlChars at line 385, avg 379µs/call
sub C4::Charset::CORE:subst; # opcode