Profile of /usr/share/koha/lib/C4/Charset.pm

Filename	/usr/share/koha/lib/C4/Charset.pm
Statements	Executed 12360 statements in 65.9ms

Subroutines
Calls	P	F	Exclusive Time	Inclusive Time	Subroutine
25	1	1	25.4ms	114ms	C4::Charset::::SetUTF8FlagC4::Charset::SetUTF8Flag
1319	1	1	16.1ms	27.1ms	C4::Charset::::NormalizeStringC4::Charset::NormalizeString
25	1	1	9.22ms	9.47ms	C4::Charset::::CORE:substC4::Charset::CORE:subst (opcode)
25	1	1	582µs	10.0ms	C4::Charset::::StripNonXmlCharsC4::Charset::StripNonXmlChars
1	1	1	340µs	1.03ms	C4::Charset::::BEGIN@24C4::Charset::BEGIN@24
1	1	1	33µs	33µs	C4::Charset::::BEGIN@30C4::Charset::BEGIN@30
1	1	1	26µs	105µs	C4::Charset::::BEGIN@26C4::Charset::BEGIN@26
1	1	1	24µs	200µs	C4::Charset::::BEGIN@25C4::Charset::BEGIN@25
1	1	1	21µs	29µs	C4::Charset::::BEGIN@20C4::Charset::BEGIN@20
1	1	1	18µs	185µs	C4::Charset::::BEGIN@28C4::Charset::BEGIN@28
1	1	1	15µs	59µs	C4::Charset::::BEGIN@23C4::Charset::BEGIN@23
1	1	1	13µs	38µs	C4::Charset::::BEGIN@21C4::Charset::BEGIN@21
0	0	0	0s	0s	C4::Charset::::IsStringUTF8ishC4::Charset::IsStringUTF8ish
0	0	0	0s	0s	C4::Charset::::MarcToUTF8RecordC4::Charset::MarcToUTF8Record
0	0	0	0s	0s	C4::Charset::::SetMarcUnicodeFlagC4::Charset::SetMarcUnicodeFlag
0	0	0	0s	0s	C4::Charset::::__ANON__[:542]C4::Charset::__ANON__[:542]
0	0	0	0s	0s	C4::Charset::_default_marc21_charconv_to_utf8
0	0	0	0s	0s	C4::Charset::_default_unimarc_charconv_to_utf8
0	0	0	0s	0s	C4::Charset::::_marc_iso5426_to_utf8C4::Charset::_marc_iso5426_to_utf8
0	0	0	0s	0s	C4::Charset::::_marc_marc8_to_utf8C4::Charset::_marc_marc8_to_utf8
0	0	0	0s	0s	C4::Charset::_marc_to_utf8_replacement_char
0	0	0	0s	0s	C4::Charset::_marc_to_utf8_via_text_iconv
0	0	0	0s	0s	C4::Charset::::char_decode5426C4::Charset::char_decode5426
0	0	0	0s	0s	C4::Charset::::nsb_cleanC4::Charset::nsb_clean

Call graph for these subroutines as a Graphviz dot language file.

Line	State ments	Time on line	Calls	Time in subs	Code
1					package C4::Charset;
2
3					# Copyright (C) 2008 LibLime
4					#
5					# This file is part of Koha.
6					#
7					# Koha is free software; you can redistribute it and/or modify it under the
8					# terms of the GNU General Public License as published by the Free Software
9					# Foundation; either version 2 of the License, or (at your option) any later
10					# version.
11					#
12					# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13					# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14					# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15					#
16					# You should have received a copy of the GNU General Public License along
17					# with Koha; if not, write to the Free Software Foundation, Inc.,
18					# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19
20	3	33µs	2	36µs	# spent 29µs (21+7) within C4::Charset::BEGIN@20 which was called: # once (21µs+7µs) by C4::Biblio::BEGIN@37 at line 20 use strict; # spent 29µs making 1 call to C4::Charset::BEGIN@20 # spent 7µs making 1 call to strict::import
21	3	34µs	2	63µs	# spent 38µs (13+25) within C4::Charset::BEGIN@21 which was called: # once (13µs+25µs) by C4::Biblio::BEGIN@37 at line 21 use warnings; # spent 38µs making 1 call to C4::Charset::BEGIN@21 # spent 25µs making 1 call to warnings::import
22
23	3	36µs	2	103µs	# spent 59µs (15+44) within C4::Charset::BEGIN@23 which was called: # once (15µs+44µs) by C4::Biblio::BEGIN@37 at line 23 use MARC::Charset qw/marc8_to_utf8/; # spent 59µs making 1 call to C4::Charset::BEGIN@23 # spent 44µs making 1 call to Exporter::import
24	3	149µs	2	1.06ms	# spent 1.03ms (340µs+693µs) within C4::Charset::BEGIN@24 which was called: # once (340µs+693µs) by C4::Biblio::BEGIN@37 at line 24 use Text::Iconv; # spent 1.03ms making 1 call to C4::Charset::BEGIN@24 # spent 27µs making 1 call to Exporter::import
25	3	65µs	2	376µs	# spent 200µs (24+176) within C4::Charset::BEGIN@25 which was called: # once (24µs+176µs) by C4::Biblio::BEGIN@37 at line 25 use C4::Debug; # spent 200µs making 1 call to C4::Charset::BEGIN@25 # spent 176µs making 1 call to Exporter::import
26	3	75µs	2	184µs	# spent 105µs (26+79) within C4::Charset::BEGIN@26 which was called: # once (26µs+79µs) by C4::Biblio::BEGIN@37 at line 26 use Unicode::Normalize; # spent 105µs making 1 call to C4::Charset::BEGIN@26 # spent 79µs making 1 call to Exporter::import
27
28	3	117µs	2	352µs	# spent 185µs (18+167) within C4::Charset::BEGIN@28 which was called: # once (18µs+167µs) by C4::Biblio::BEGIN@37 at line 28 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); # spent 185µs making 1 call to C4::Charset::BEGIN@28 # spent 167µs making 1 call to vars::import
29
30					# spent 33µs within C4::Charset::BEGIN@30 which was called: # once (33µs+0s) by C4::Biblio::BEGIN@37 at line 44 BEGIN {
31					# set the version for version checking
32	4	33µs			$VERSION = 3.07.00.049;
33					require Exporter;
34					@ISA = qw(Exporter);
35					@EXPORT = qw(
36					NormalizeString
37					IsStringUTF8ish
38					MarcToUTF8Record
39					SetUTF8Flag
40					SetMarcUnicodeFlag
41					StripNonXmlChars
42					nsb_clean
43					);
44	1	6.06ms	1	33µs	} # spent 33µs making 1 call to C4::Charset::BEGIN@30
45
46					=head1 NAME
47
- -
77					=head1 FUNCTIONS
78
- -
107					sub IsStringUTF8ish {
108					my $str = shift;
109
110					return 1 if utf8::is_utf8($str);
111					return utf8::decode($str);
112					}
113
114					=head2 SetUTF8Flag
115
- -
133					# spent 114ms (25.4+88.6) within C4::Charset::SetUTF8Flag which was called 25 times, avg 4.56ms/call: # 25 times (25.4ms+88.6ms) by C4::Search::searchResults at line 2011 of /usr/share/koha/lib/C4/Search.pm, avg 4.56ms/call sub SetUTF8Flag{
134	3986	20.9ms			my ($record, $nfd)=@_;
135			25	176µs	return unless ($record && $record->fields()); # spent 176µs making 25 calls to MARC::Record::fields, avg 7µs/call
136			25	180µs	foreach my $field ($record->fields()){ # spent 180µs making 25 calls to MARC::Record::fields, avg 7µs/call
137			544	2.54ms	if ($field->tag()>=10){ # spent 2.54ms making 544 calls to MARC::Field::tag, avg 5µs/call
138					my @subfields;
139			512	11.1ms	foreach my $subfield ($field->subfields()){ # spent 11.1ms making 512 calls to MARC::Field::subfields, avg 22µs/call
140			1319	27.1ms	push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd)); # spent 27.1ms making 1319 calls to C4::Charset::NormalizeString, avg 21µs/call
141					}
142			2048	38.7ms	my $newfield=MARC::Field->new( # spent 26.4ms making 512 calls to MARC::Field::new, avg 52µs/call # spent 10.0ms making 1024 calls to MARC::Field::indicator, avg 10µs/call # spent 2.24ms making 512 calls to MARC::Field::tag, avg 4µs/call
143					$field->tag(),
144					$field->indicator(1),
145					$field->indicator(2),
146					@subfields
147					);
148			512	8.87ms	$field->replace_with($newfield); # spent 8.87ms making 512 calls to MARC::Field::replace_with, avg 17µs/call
149					}
150					}
151					}
152
153					=head2 NormalizeString
154
- -
173					# spent 27.1ms (16.1+11.0) within C4::Charset::NormalizeString which was called 1319 times, avg 21µs/call: # 1319 times (16.1ms+11.0ms) by C4::Charset::SetUTF8Flag at line 140, avg 21µs/call sub NormalizeString{
174	7914	27.8ms			my ($string,$nfd,$transform)=@_;
175			2638	2.95ms	utf8::decode($string) unless (utf8::is_utf8($string)); # spent 1.70ms making 1319 calls to utf8::decode, avg 1µs/call # spent 1.25ms making 1319 calls to utf8::is_utf8, avg 945ns/call
176					if ($nfd){
177					$string= NFD($string);
178					}
179					else {
180			1319	8.06ms	$string=NFC($string); # spent 8.06ms making 1319 calls to Unicode::Normalize::NFC, avg 6µs/call
181					}
182					if ($transform){
183					$string=~s/\<\|\>\|\^\|\;\|\.\|\?\|,\|\-\|$\|$\|\[\|\]\|\{\|\}\|\$\|\%\|\!\|\*\|\:\|\\\|\/\|\&\|\"\|\'/ /g;
184					#removing one letter words "d'" "l'" was changed into "d " "l "
185					$string=~s/\b\S\b//g;
186					$string=~s/\s+$//g;
187					}
188					return $string;
189					}
190
191					=head2 MarcToUTF8Record
192
- -
212					sub MarcToUTF8Record {
213					my $marc = shift;
214					my $marc_flavour = shift;
215					my $source_encoding = shift;
216					my $marc_record;
217					my $marc_blob_is_utf8 = 0;
218					if (ref($marc) eq 'MARC::Record') {
219					my $marc_blob = $marc->as_usmarc();
220					$marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
221					$marc_record = $marc;
222					} else {
223					# dealing with a MARC blob
224
225					# remove any ersatz whitespace from the beginning and
226					# end of the MARC blob -- these can creep into MARC
227					# files produced by several sources -- caller really
228					# should be doing this, however
229					$marc =~ s/^\s+//;
230					$marc =~ s/\s+$//;
231					$marc_blob_is_utf8 = IsStringUTF8ish($marc);
232					eval {
233					$marc_record = MARC::Record->new_from_usmarc($marc);
234					};
235					if ($@) {
236					# if we fail the first time, one likely problem
237					# is that we have a MARC21 record that says that it's
238					# UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
239					# We'll try parsing it again.
240					substr($marc, 9, 1) = ' ';
241					eval {
242					$marc_record = MARC::Record->new_from_usmarc($marc);
243					};
244					if ($@) {
245					# it's hopeless; return an empty MARC::Record
246					return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
247					}
248					}
249					}
250
251					# If we do not know the source encoding, try some guesses
252					# as follows:
253					# 1. Record is UTF-8 already.
254					# 2. If MARC flavor is MARC21 or NORMARC, then
255					# a. record is MARC-8
256					# b. record is ISO-8859-1
257					# 3. If MARC flavor is UNIMARC, then
258					if (not defined $source_encoding) {
259					if ($marc_blob_is_utf8) {
260					# note that for MARC21/NORMARC we are not bothering to check
261					# if the Leader/09 is set to 'a' or not -- because
262					# of problems with various ILSs (including Koha in the
263					# past, alas), this just is not trustworthy.
264					SetMarcUnicodeFlag($marc_record, $marc_flavour);
265					return $marc_record, 'UTF-8', [];
266					} else {
267					if ($marc_flavour eq 'MARC21' \|\| $marc_flavour eq 'NORMARC') {
268					return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
269					} elsif ($marc_flavour =~/UNIMARC/) {
270					return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
271					} else {
272					return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
273					}
274					}
275					} else {
276					# caller knows the character encoding
277					my $original_marc_record = $marc_record->clone();
278					my @errors;
279					if ($source_encoding =~ /utf-?8/i) {
280					if ($marc_blob_is_utf8) {
281					SetMarcUnicodeFlag($marc_record, $marc_flavour);
282					return $marc_record, 'UTF-8', [];
283					} else {
284					push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
285					}
286					} elsif ($source_encoding =~ /marc-?8/i) {
287					@errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
288					} elsif ($source_encoding =~ /5426/) {
289					@errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
290					} else {
291					# assume any other character encoding is for Text::Iconv
292					@errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
293					}
294
295					if (@errors) {
296					_marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
297					return $original_marc_record, 'failed', \@errors;
298					} else {
299					return $marc_record, $source_encoding, [];
300					}
301					}
302
303					}
304
305					=head2 SetMarcUnicodeFlag
306
- -
317					sub SetMarcUnicodeFlag {
318					my $marc_record = shift;
319					my $marc_flavour = shift; # \|\| C4::Context->preference("marcflavour");
320
321					$marc_record->encoding('UTF-8');
322					if ($marc_flavour eq 'MARC21' \|\| $marc_flavour eq 'NORMARC') {
323					my $leader = $marc_record->leader();
324					substr($leader, 9, 1) = 'a';
325					$marc_record->leader($leader);
326					} elsif ($marc_flavour =~/UNIMARC/) {
327					my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
328					$defaultlanguage = "fre" if (!$defaultlanguage \|\| length($defaultlanguage) != 3);
329					my $string;
330					my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
331					$string=$marc_record->subfield( 100, "a" );
332					if (defined $string && length($string)==$subflength) {
333					$string = substr $string, 0,$subflength if (length($string)>$subflength);
334					}
335					else {
336					$string = POSIX::strftime( "%Y%m%d", localtime );
337					$string =~ s/\-//g;
338					$string = sprintf( "%-*s", $subflength, $string );
339					substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
340					}
341					substr( $string, $encodingposition, 3, "y50" );
342					if ( $marc_record->subfield( 100, "a" ) ) {
343					$marc_record->field('100')->update(a=>$string);
344					}
345					else {
346					$marc_record->insert_grouped_field(
347					MARC::Field->new( 100, '', '', "a" => $string ) );
348					}
349					$debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
350					} else {
351					warn "Unrecognized marcflavour: $marc_flavour";
352					}
353					}
354
355					=head2 StripNonXmlChars
356
- -
380					# spent 10.0ms (582µs+9.47) within C4::Charset::StripNonXmlChars which was called 25 times, avg 402µs/call: # 25 times (582µs+9.47ms) by C4::Biblio::GetMarcBiblio at line 1258 of /usr/share/koha/lib/C4/Biblio.pm, avg 402µs/call sub StripNonXmlChars {
381	100	8.96ms			my $str = shift;
382					if (!defined($str) \|\| $str eq ""){
383					return "";
384					}
385	1	796µs	26	9.71ms	$str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g; # spent 9.47ms making 25 calls to C4::Charset::CORE:subst, avg 379µs/call # spent 246µs making 1 call to utf8::SWASHNEW
386					return $str;
387					}
388
- -
391					=head2 nsb_clean
392
- -
402					sub nsb_clean {
403					my $NSB = '\x88' ; # NSB : begin Non Sorting Block
404					my $NSE = '\x89' ; # NSE : Non Sorting Block end
405					my $NSB2 = '\x98' ; # NSB : begin Non Sorting Block
406					my $NSE2 = '\x9C' ; # NSE : Non Sorting Block end
407					my $C2 = '\xC2' ; # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
408
409					# handles non sorting blocks
410					my ($string) = @_ ;
411					$_ = $string ;
412					s/$NSB//g ;
413					s/$NSE//g ;
414					s/$NSB2//g ;
415					s/$NSE2//g ;
416					s/$C2//g ;
417					$string = $_ ;
418
419					return($string) ;
420					}
421
422
423					=head1 INTERNAL FUNCTIONS
424
- -
441					sub _default_marc21_charconv_to_utf8 {
442					my $marc_record = shift;
443					my $marc_flavour = shift;
444
445					my $trial_marc8 = $marc_record->clone();
446					my @all_errors = ();
447					my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
448					unless (@errors) {
449					return $trial_marc8, 'MARC-8', [];
450					}
451					push @all_errors, @errors;
452
453					my $trial_8859_1 = $marc_record->clone();
454					@errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
455					unless (@errors) {
456					return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
457					# instead if we wanted to report details
458					# of the failed attempt at MARC-8 => UTF-8
459					}
460					push @all_errors, @errors;
461
462					my $default_converted = $marc_record->clone();
463					_marc_to_utf8_replacement_char($default_converted, $marc_flavour);
464					return $default_converted, 'failed', \@all_errors;
465					}
466
467					=head2 _default_unimarc_charconv_to_utf8
468
- -
483					sub _default_unimarc_charconv_to_utf8 {
484					my $marc_record = shift;
485					my $marc_flavour = shift;
486
487					my $trial_marc8 = $marc_record->clone();
488					my @all_errors = ();
489					my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
490					unless (@errors) {
491					return $trial_marc8, 'iso-5426';
492					}
493					push @all_errors, @errors;
494
495					my $trial_8859_1 = $marc_record->clone();
496					@errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
497					unless (@errors) {
498					return $trial_8859_1, 'iso-8859-1';
499					}
500					push @all_errors, @errors;
501
502					my $default_converted = $marc_record->clone();
503					_marc_to_utf8_replacement_char($default_converted, $marc_flavour);
504					return $default_converted, 'failed', \@all_errors;
505					}
506
507					=head2 _marc_marc8_to_utf8
508
- -
518					sub _marc_marc8_to_utf8 {
519					my $marc_record = shift;
520					my $marc_flavour = shift;
521
522					my $prev_ignore = MARC::Charset->ignore_errors();
523					MARC::Charset->ignore_errors(1);
524
525					# trap warnings raised by MARC::Charset
526					my @errors = ();
527					local $SIG{__WARN__} = sub {
528					my $msg = $_[0];
529					if ($msg =~ /MARC.Charset/) {
530					# FIXME - purpose of this regexp is to strip out the
531					# line reference to MARC/Charset.pm, but as it
532					# exists probably won't work quite on Windows --
533					# some sort of minimal-bunch back-tracking RE
534					# would be helpful here
535					$msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
536					push @errors, $msg;
537					} else {
538					# if warning doesn't come from MARC::Charset, just
539					# pass it on
540					warn $msg;
541					}
542					};
543
544					foreach my $field ($marc_record->fields()) {
545					if ($field->is_control_field()) {
546					; # do nothing -- control fields should not contain non-ASCII characters
547					} else {
548					my @converted_subfields;
549					foreach my $subfield ($field->subfields()) {
550					my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
551					unless (IsStringUTF8ish($utf8sf)) {
552					# Because of a bug in MARC::Charset 0.98, if the string
553					# has (a) one or more diacritics that (b) are only in character positions
554					# 128 to 255 inclusive, the resulting converted string is not in
555					# UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
556					# occurs, upgrade the string in place. Moral of the story seems to be
557					# that pack("U", ...) is better than chr(...) if you need to guarantee
558					# that the resulting string is UTF-8.
559					utf8::upgrade($utf8sf);
560					}
561					push @converted_subfields, $subfield->[0], $utf8sf;
562					}
563
564					$field->replace_with(MARC::Field->new(
565					$field->tag(), $field->indicator(1), $field->indicator(2),
566					@converted_subfields)
567					);
568					}
569					}
570
571					MARC::Charset->ignore_errors($prev_ignore);
572
573					SetMarcUnicodeFlag($marc_record, $marc_flavour);
574
575					return @errors;
576					}
577
578					=head2 _marc_iso5426_to_utf8
579
- -
592					sub _marc_iso5426_to_utf8 {
593					my $marc_record = shift;
594					my $marc_flavour = shift;
595
596					my @errors = ();
597
598					foreach my $field ($marc_record->fields()) {
599					if ($field->is_control_field()) {
600					; # do nothing -- control fields should not contain non-ASCII characters
601					} else {
602					my @converted_subfields;
603					foreach my $subfield ($field->subfields()) {
604					my $utf8sf = char_decode5426($subfield->[1]);
605					push @converted_subfields, $subfield->[0], $utf8sf;
606					}
607
608					$field->replace_with(MARC::Field->new(
609					$field->tag(), $field->indicator(1), $field->indicator(2),
610					@converted_subfields)
611					);
612					}
613					}
614
615					SetMarcUnicodeFlag($marc_record, $marc_flavour);
616
617					return @errors;
618					}
619
620					=head2 _marc_to_utf8_via_text_iconv
621
- -
634					sub _marc_to_utf8_via_text_iconv {
635					my $marc_record = shift;
636					my $marc_flavour = shift;
637					my $source_encoding = shift;
638
639					my @errors = ();
640					my $decoder;
641					eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
642					if ($@) {
643					push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
644					return @errors;
645					}
646
647					my $prev_raise_error = Text::Iconv->raise_error();
648					Text::Iconv->raise_error(1);
649
650					foreach my $field ($marc_record->fields()) {
651					if ($field->is_control_field()) {
652					; # do nothing -- control fields should not contain non-ASCII characters
653					} else {
654					my @converted_subfields;
655					foreach my $subfield ($field->subfields()) {
656					my $converted_value;
657					my $conversion_ok = 1;
658					eval { $converted_value = $decoder->convert($subfield->[1]); };
659					if ($@) {
660					$conversion_ok = 0;
661					push @errors, $@;
662					} elsif (not defined $converted_value) {
663					$conversion_ok = 0;
664					push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
665					}
666
667					if ($conversion_ok) {
668					push @converted_subfields, $subfield->[0], $converted_value;
669					} else {
670					$converted_value = $subfield->[1];
671					$converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
672					push @converted_subfields, $subfield->[0], $converted_value;
673					}
674					}
675
676					$field->replace_with(MARC::Field->new(
677					$field->tag(), $field->indicator(1), $field->indicator(2),
678					@converted_subfields)
679					);
680					}
681					}
682
683					SetMarcUnicodeFlag($marc_record, $marc_flavour);
684					Text::Iconv->raise_error($prev_raise_error);
685
686					return @errors;
687					}
688
689					=head2 _marc_to_utf8_replacement_char
690
- -
703					sub _marc_to_utf8_replacement_char {
704					my $marc_record = shift;
705					my $marc_flavour = shift;
706
707					foreach my $field ($marc_record->fields()) {
708					if ($field->is_control_field()) {
709					; # do nothing -- control fields should not contain non-ASCII characters
710					} else {
711					my @converted_subfields;
712					foreach my $subfield ($field->subfields()) {
713					my $value = $subfield->[1];
714					$value =~ s/[\200-\377]/\xef\xbf\xbd/g;
715					push @converted_subfields, $subfield->[0], $value;
716					}
717
718					$field->replace_with(MARC::Field->new(
719					$field->tag(), $field->indicator(1), $field->indicator(2),
720					@converted_subfields)
721					);
722					}
723					}
724
725					SetMarcUnicodeFlag($marc_record, $marc_flavour);
726					}
727
728					=head2 char_decode5426
729
- -
737	1	900ns			my %chars;
738	1	8µs			$chars{0xb0}=0x0101;#3/0ayn[ain]
739	1	3µs			$chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
740					#$chars{0xb2}=0x00e0;#'à';
741	1	2µs			$chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
742					#$chars{0xb3}=0x00e7;#'ç';
743	1	2µs			$chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
744					# $chars{0xb4}='è';
745	1	1µs			$chars{0xb4}=0x00e8;
746	1	1µs			$chars{0xbd}=0x02b9;
747	1	2µs			$chars{0xbe}=0x02ba;
748					# $chars{0xb5}='é';
749	1	2µs			$chars{0xb5}=0x00e9;
750	1	3µs			$chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
751	1	1µs			$chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
752	1	1µs			$chars{0xfa}=0x0153; #oe
753	1	3µs			$chars{0xea}=0x0152; #oe
754	1	2µs			$chars{0x81d1}=0x00b0;
755
756					####
757					## combined characters iso5426
758
759	1	1µs			$chars{0xc041}=0x1ea2; # capital a with hook above
760	1	1µs			$chars{0xc045}=0x1eba; # capital e with hook above
761	1	1µs			$chars{0xc049}=0x1ec8; # capital i with hook above
762	1	2µs			$chars{0xc04f}=0x1ece; # capital o with hook above
763	1	2µs			$chars{0xc055}=0x1ee6; # capital u with hook above
764	1	2µs			$chars{0xc059}=0x1ef6; # capital y with hook above
765	1	1µs			$chars{0xc061}=0x1ea3; # small a with hook above
766	1	2µs			$chars{0xc065}=0x1ebb; # small e with hook above
767	1	1µs			$chars{0xc069}=0x1ec9; # small i with hook above
768	1	2µs			$chars{0xc06f}=0x1ecf; # small o with hook above
769	1	2µs			$chars{0xc075}=0x1ee7; # small u with hook above
770	1	2µs			$chars{0xc079}=0x1ef7; # small y with hook above
771
772					# 4/1 grave accent
773	1	2µs			$chars{0xc141}=0x00c0; # capital a with grave accent
774	1	9µs			$chars{0xc145}=0x00c8; # capital e with grave accent
775	1	3µs			$chars{0xc149}=0x00cc; # capital i with grave accent
776	1	1µs			$chars{0xc14f}=0x00d2; # capital o with grave accent
777	1	2µs			$chars{0xc155}=0x00d9; # capital u with grave accent
778	1	2µs			$chars{0xc157}=0x1e80; # capital w with grave
779	1	3µs			$chars{0xc159}=0x1ef2; # capital y with grave
780	1	2µs			$chars{0xc161}=0x00e0; # small a with grave accent
781	1	2µs			$chars{0xc165}=0x00e8; # small e with grave accent
782	1	2µs			$chars{0xc169}=0x00ec; # small i with grave accent
783	1	2µs			$chars{0xc16f}=0x00f2; # small o with grave accent
784	1	2µs			$chars{0xc175}=0x00f9; # small u with grave accent
785	1	2µs			$chars{0xc177}=0x1e81; # small w with grave
786	1	2µs			$chars{0xc179}=0x1ef3; # small y with grave
787					# 4/2 acute accent
788	1	2µs			$chars{0xc241}=0x00c1; # capital a with acute accent
789	1	2µs			$chars{0xc243}=0x0106; # capital c with acute accent
790	1	1µs			$chars{0xc245}=0x00c9; # capital e with acute accent
791	1	2µs			$chars{0xc247}=0x01f4; # capital g with acute
792	1	2µs			$chars{0xc249}=0x00cd; # capital i with acute accent
793	1	2µs			$chars{0xc24b}=0x1e30; # capital k with acute
794	1	3µs			$chars{0xc24c}=0x0139; # capital l with acute accent
795	1	1µs			$chars{0xc24d}=0x1e3e; # capital m with acute
796	1	3µs			$chars{0xc24e}=0x0143; # capital n with acute accent
797	1	1µs			$chars{0xc24f}=0x00d3; # capital o with acute accent
798	1	2µs			$chars{0xc250}=0x1e54; # capital p with acute
799	1	2µs			$chars{0xc252}=0x0154; # capital r with acute accent
800	1	1µs			$chars{0xc253}=0x015a; # capital s with acute accent
801	1	1µs			$chars{0xc255}=0x00da; # capital u with acute accent
802	1	1µs			$chars{0xc257}=0x1e82; # capital w with acute
803	1	1µs			$chars{0xc259}=0x00dd; # capital y with acute accent
804	1	1µs			$chars{0xc25a}=0x0179; # capital z with acute accent
805	1	2µs			$chars{0xc261}=0x00e1; # small a with acute accent
806	1	1µs			$chars{0xc263}=0x0107; # small c with acute accent
807	1	1µs			$chars{0xc265}=0x00e9; # small e with acute accent
808	1	1µs			$chars{0xc267}=0x01f5; # small g with acute
809	1	2µs			$chars{0xc269}=0x00ed; # small i with acute accent
810	1	2µs			$chars{0xc26b}=0x1e31; # small k with acute
811	1	2µs			$chars{0xc26c}=0x013a; # small l with acute accent
812	1	9µs			$chars{0xc26d}=0x1e3f; # small m with acute
813	1	2µs			$chars{0xc26e}=0x0144; # small n with acute accent
814	1	2µs			$chars{0xc26f}=0x00f3; # small o with acute accent
815	1	1µs			$chars{0xc270}=0x1e55; # small p with acute
816	1	1µs			$chars{0xc272}=0x0155; # small r with acute accent
817	1	1µs			$chars{0xc273}=0x015b; # small s with acute accent
818	1	2µs			$chars{0xc275}=0x00fa; # small u with acute accent
819	1	1µs			$chars{0xc277}=0x1e83; # small w with acute
820	1	1µs			$chars{0xc279}=0x00fd; # small y with acute accent
821	1	1µs			$chars{0xc27a}=0x017a; # small z with acute accent
822	1	2µs			$chars{0xc2e1}=0x01fc; # capital ae with acute
823	1	1µs			$chars{0xc2f1}=0x01fd; # small ae with acute
824					# 4/3 circumflex accent
825	1	2µs			$chars{0xc341}=0x00c2; # capital a with circumflex accent
826	1	1µs			$chars{0xc343}=0x0108; # capital c with circumflex
827	1	2µs			$chars{0xc345}=0x00ca; # capital e with circumflex accent
828	1	2µs			$chars{0xc347}=0x011c; # capital g with circumflex
829	1	1µs			$chars{0xc348}=0x0124; # capital h with circumflex
830	1	1µs			$chars{0xc349}=0x00ce; # capital i with circumflex accent
831	1	2µs			$chars{0xc34a}=0x0134; # capital j with circumflex
832	1	1µs			$chars{0xc34f}=0x00d4; # capital o with circumflex accent
833	1	2µs			$chars{0xc353}=0x015c; # capital s with circumflex
834	1	1µs			$chars{0xc355}=0x00db; # capital u with circumflex
835	1	2µs			$chars{0xc357}=0x0174; # capital w with circumflex
836	1	2µs			$chars{0xc359}=0x0176; # capital y with circumflex
837	1	1µs			$chars{0xc35a}=0x1e90; # capital z with circumflex
838	1	1µs			$chars{0xc361}=0x00e2; # small a with circumflex accent
839	1	2µs			$chars{0xc363}=0x0109; # small c with circumflex
840	1	2µs			$chars{0xc365}=0x00ea; # small e with circumflex accent
841	1	1µs			$chars{0xc367}=0x011d; # small g with circumflex
842	1	2µs			$chars{0xc368}=0x0125; # small h with circumflex
843	1	2µs			$chars{0xc369}=0x00ee; # small i with circumflex accent
844	1	2µs			$chars{0xc36a}=0x0135; # small j with circumflex
845	1	2µs			$chars{0xc36e}=0x00f1; # small n with tilde
846	1	2µs			$chars{0xc36f}=0x00f4; # small o with circumflex accent
847	1	2µs			$chars{0xc373}=0x015d; # small s with circumflex
848	1	2µs			$chars{0xc375}=0x00fb; # small u with circumflex
849	1	2µs			$chars{0xc377}=0x0175; # small w with circumflex
850	1	2µs			$chars{0xc379}=0x0177; # small y with circumflex
851	1	1µs			$chars{0xc37a}=0x1e91; # small z with circumflex
852					# 4/4 tilde
853	1	2µs			$chars{0xc441}=0x00c3; # capital a with tilde
854	1	1µs			$chars{0xc445}=0x1ebc; # capital e with tilde
855	1	1µs			$chars{0xc449}=0x0128; # capital i with tilde
856	1	1µs			$chars{0xc44e}=0x00d1; # capital n with tilde
857	1	1µs			$chars{0xc44f}=0x00d5; # capital o with tilde
858	1	2µs			$chars{0xc455}=0x0168; # capital u with tilde
859	1	1µs			$chars{0xc456}=0x1e7c; # capital v with tilde
860	1	2µs			$chars{0xc459}=0x1ef8; # capital y with tilde
861	1	2µs			$chars{0xc461}=0x00e3; # small a with tilde
862	1	2µs			$chars{0xc465}=0x1ebd; # small e with tilde
863	1	8µs			$chars{0xc469}=0x0129; # small i with tilde
864	1	2µs			$chars{0xc46e}=0x00f1; # small n with tilde
865	1	1µs			$chars{0xc46f}=0x00f5; # small o with tilde
866	1	1µs			$chars{0xc475}=0x0169; # small u with tilde
867	1	2µs			$chars{0xc476}=0x1e7d; # small v with tilde
868	1	1µs			$chars{0xc479}=0x1ef9; # small y with tilde
869					# 4/5 macron
870	1	2µs			$chars{0xc541}=0x0100; # capital a with macron
871	1	2µs			$chars{0xc545}=0x0112; # capital e with macron
872	1	2µs			$chars{0xc547}=0x1e20; # capital g with macron
873	1	1µs			$chars{0xc549}=0x012a; # capital i with macron
874	1	5µs			$chars{0xc54f}=0x014c; # capital o with macron
875	1	2µs			$chars{0xc555}=0x016a; # capital u with macron
876	1	2µs			$chars{0xc561}=0x0101; # small a with macron
877	1	2µs			$chars{0xc565}=0x0113; # small e with macron
878	1	2µs			$chars{0xc567}=0x1e21; # small g with macron
879	1	2µs			$chars{0xc569}=0x012b; # small i with macron
880	1	6µs			$chars{0xc56f}=0x014d; # small o with macron
881	1	2µs			$chars{0xc575}=0x016b; # small u with macron
882	1	2µs			$chars{0xc572}=0x0159; # small r with macron
883	1	2µs			$chars{0xc5e1}=0x01e2; # capital ae with macron
884	1	1µs			$chars{0xc5f1}=0x01e3; # small ae with macron
885					# 4/6 breve
886	1	2µs			$chars{0xc641}=0x0102; # capital a with breve
887	1	2µs			$chars{0xc645}=0x0114; # capital e with breve
888	1	2µs			$chars{0xc647}=0x011e; # capital g with breve
889	1	2µs			$chars{0xc649}=0x012c; # capital i with breve
890	1	1µs			$chars{0xc64f}=0x014e; # capital o with breve
891	1	2µs			$chars{0xc655}=0x016c; # capital u with breve
892	1	1µs			$chars{0xc661}=0x0103; # small a with breve
893	1	2µs			$chars{0xc665}=0x0115; # small e with breve
894	1	1µs			$chars{0xc667}=0x011f; # small g with breve
895	1	2µs			$chars{0xc669}=0x012d; # small i with breve
896	1	3µs			$chars{0xc66f}=0x014f; # small o with breve
897	1	2µs			$chars{0xc675}=0x016d; # small u with breve
898					# 4/7 dot above
899	1	2µs			$chars{0xc7b0}=0x01e1; # Ain with dot above
900	1	2µs			$chars{0xc742}=0x1e02; # capital b with dot above
901	1	2µs			$chars{0xc743}=0x010a; # capital c with dot above
902	1	2µs			$chars{0xc744}=0x1e0a; # capital d with dot above
903	1	1µs			$chars{0xc745}=0x0116; # capital e with dot above
904	1	1µs			$chars{0xc746}=0x1e1e; # capital f with dot above
905	1	1µs			$chars{0xc747}=0x0120; # capital g with dot above
906	1	2µs			$chars{0xc748}=0x1e22; # capital h with dot above
907	1	2µs			$chars{0xc749}=0x0130; # capital i with dot above
908	1	1µs			$chars{0xc74d}=0x1e40; # capital m with dot above
909	1	1µs			$chars{0xc74e}=0x1e44; # capital n with dot above
910	1	1µs			$chars{0xc750}=0x1e56; # capital p with dot above
911	1	2µs			$chars{0xc752}=0x1e58; # capital r with dot above
912	1	2µs			$chars{0xc753}=0x1e60; # capital s with dot above
913	1	1µs			$chars{0xc754}=0x1e6a; # capital t with dot above
914	1	2µs			$chars{0xc757}=0x1e86; # capital w with dot above
915	1	6µs			$chars{0xc758}=0x1e8a; # capital x with dot above
916	1	2µs			$chars{0xc759}=0x1e8e; # capital y with dot above
917	1	1µs			$chars{0xc75a}=0x017b; # capital z with dot above
918	1	2µs			$chars{0xc761}=0x0227; # small b with dot above
919	1	1µs			$chars{0xc762}=0x1e03; # small b with dot above
920	1	1µs			$chars{0xc763}=0x010b; # small c with dot above
921	1	1µs			$chars{0xc764}=0x1e0b; # small d with dot above
922	1	6µs			$chars{0xc765}=0x0117; # small e with dot above
923	1	1µs			$chars{0xc766}=0x1e1f; # small f with dot above
924	1	1µs			$chars{0xc767}=0x0121; # small g with dot above
925	1	1µs			$chars{0xc768}=0x1e23; # small h with dot above
926	1	2µs			$chars{0xc76d}=0x1e41; # small m with dot above
927	1	1µs			$chars{0xc76e}=0x1e45; # small n with dot above
928	1	1µs			$chars{0xc770}=0x1e57; # small p with dot above
929	1	2µs			$chars{0xc772}=0x1e59; # small r with dot above
930	1	2µs			$chars{0xc773}=0x1e61; # small s with dot above
931	1	1µs			$chars{0xc774}=0x1e6b; # small t with dot above
932	1	1µs			$chars{0xc777}=0x1e87; # small w with dot above
933	1	1µs			$chars{0xc778}=0x1e8b; # small x with dot above
934	1	2µs			$chars{0xc779}=0x1e8f; # small y with dot above
935	1	1µs			$chars{0xc77a}=0x017c; # small z with dot above
936					# 4/8 trema, diaresis
937	1	2µs			$chars{0xc820}=0x00a8; # diaeresis
938	1	2µs			$chars{0xc841}=0x00c4; # capital a with diaeresis
939	1	2µs			$chars{0xc845}=0x00cb; # capital e with diaeresis
940	1	2µs			$chars{0xc848}=0x1e26; # capital h with diaeresis
941	1	2µs			$chars{0xc849}=0x00cf; # capital i with diaeresis
942	1	3µs			$chars{0xc84f}=0x00d6; # capital o with diaeresis
943	1	2µs			$chars{0xc855}=0x00dc; # capital u with diaeresis
944	1	2µs			$chars{0xc857}=0x1e84; # capital w with diaeresis
945	1	1µs			$chars{0xc858}=0x1e8c; # capital x with diaeresis
946	1	2µs			$chars{0xc859}=0x0178; # capital y with diaeresis
947	1	2µs			$chars{0xc861}=0x00e4; # small a with diaeresis
948	1	1µs			$chars{0xc865}=0x00eb; # small e with diaeresis
949	1	1µs			$chars{0xc868}=0x1e27; # small h with diaeresis
950	1	2µs			$chars{0xc869}=0x00ef; # small i with diaeresis
951	1	8µs			$chars{0xc86f}=0x00f6; # small o with diaeresis
952	1	2µs			$chars{0xc874}=0x1e97; # small t with diaeresis
953	1	1µs			$chars{0xc875}=0x00fc; # small u with diaeresis
954	1	2µs			$chars{0xc877}=0x1e85; # small w with diaeresis
955	1	1µs			$chars{0xc878}=0x1e8d; # small x with diaeresis
956	1	1µs			$chars{0xc879}=0x00ff; # small y with diaeresis
957					# 4/9 umlaut
958	1	1µs			$chars{0xc920}=0x00a8; # [diaeresis]
959	1	1µs			$chars{0xc961}=0x00e4; # a with umlaut
960	1	1µs			$chars{0xc965}=0x00eb; # e with umlaut
961	1	1µs			$chars{0xc969}=0x00ef; # i with umlaut
962	1	1µs			$chars{0xc96f}=0x00f6; # o with umlaut
963	1	1µs			$chars{0xc975}=0x00fc; # u with umlaut
964					# 4/10 circle above
965	1	1µs			$chars{0xca41}=0x00c5; # capital a with ring above
966	1	2µs			$chars{0xcaad}=0x016e; # capital u with ring above
967	1	1µs			$chars{0xca61}=0x00e5; # small a with ring above
968	1	1µs			$chars{0xca75}=0x016f; # small u with ring above
969	1	6µs			$chars{0xca77}=0x1e98; # small w with ring above
970	1	2µs			$chars{0xca79}=0x1e99; # small y with ring above
971					# 4/11 high comma off centre
972					# 4/12 inverted high comma centred
973					# 4/13 double acute accent
974	1	2µs			$chars{0xcd4f}=0x0150; # capital o with double acute
975	1	1µs			$chars{0xcd55}=0x0170; # capital u with double acute
976	1	2µs			$chars{0xcd6f}=0x0151; # small o with double acute
977	1	2µs			$chars{0xcd75}=0x0171; # small u with double acute
978					# 4/14 horn
979	1	2µs			$chars{0xce54}=0x01a0; # latin capital letter o with horn
980	1	2µs			$chars{0xce55}=0x01af; # latin capital letter u with horn
981	1	1µs			$chars{0xce74}=0x01a1; # latin small letter o with horn
982	1	900ns			$chars{0xce75}=0x01b0; # latin small letter u with horn
983					# 4/15 caron (hacek
984	1	900ns			$chars{0xcf41}=0x01cd; # capital a with caron
985	1	900ns			$chars{0xcf43}=0x010c; # capital c with caron
986	1	1µs			$chars{0xcf44}=0x010e; # capital d with caron
987	1	1µs			$chars{0xcf45}=0x011a; # capital e with caron
988	1	2µs			$chars{0xcf47}=0x01e6; # capital g with caron
989	1	900ns			$chars{0xcf49}=0x01cf; # capital i with caron
990	1	2µs			$chars{0xcf4b}=0x01e8; # capital k with caron
991	1	1µs			$chars{0xcf4c}=0x013d; # capital l with caron
992	1	2µs			$chars{0xcf4e}=0x0147; # capital n with caron
993	1	1µs			$chars{0xcf4f}=0x01d1; # capital o with caron
994	1	2µs			$chars{0xcf52}=0x0158; # capital r with caron
995	1	1µs			$chars{0xcf53}=0x0160; # capital s with caron
996	1	2µs			$chars{0xcf54}=0x0164; # capital t with caron
997	1	1µs			$chars{0xcf55}=0x01d3; # capital u with caron
998	1	1µs			$chars{0xcf5a}=0x017d; # capital z with caron
999	1	1µs			$chars{0xcf61}=0x01ce; # small a with caron
1000	1	1µs			$chars{0xcf63}=0x010d; # small c with caron
1001	1	1µs			$chars{0xcf64}=0x010f; # small d with caron
1002	1	1µs			$chars{0xcf65}=0x011b; # small e with caron
1003	1	2µs			$chars{0xcf67}=0x01e7; # small g with caron
1004	1	1µs			$chars{0xcf69}=0x01d0; # small i with caron
1005	1	2µs			$chars{0xcf6a}=0x01f0; # small j with caron
1006	1	2µs			$chars{0xcf6b}=0x01e9; # small k with caron
1007	1	1µs			$chars{0xcf6c}=0x013e; # small l with caron
1008	1	2µs			$chars{0xcf6e}=0x0148; # small n with caron
1009	1	2µs			$chars{0xcf6f}=0x01d2; # small o with caron
1010	1	1µs			$chars{0xcf72}=0x0159; # small r with caron
1011	1	2µs			$chars{0xcf73}=0x0161; # small s with caron
1012	1	1µs			$chars{0xcf74}=0x0165; # small t with caron
1013	1	2µs			$chars{0xcf75}=0x01d4; # small u with caron
1014	1	1µs			$chars{0xcf7a}=0x017e; # small z with caron
1015					# 5/0 cedilla
1016	1	900ns			$chars{0xd020}=0x00b8; # cedilla
1017	1	1µs			$chars{0xd043}=0x00c7; # capital c with cedilla
1018	1	1µs			$chars{0xd044}=0x1e10; # capital d with cedilla
1019	1	14µs			$chars{0xd047}=0x0122; # capital g with cedilla
1020	1	1µs			$chars{0xd048}=0x1e28; # capital h with cedilla
1021	1	2µs			$chars{0xd04b}=0x0136; # capital k with cedilla
1022	1	2µs			$chars{0xd04c}=0x013b; # capital l with cedilla
1023	1	900ns			$chars{0xd04e}=0x0145; # capital n with cedilla
1024	1	2µs			$chars{0xd052}=0x0156; # capital r with cedilla
1025	1	800ns			$chars{0xd053}=0x015e; # capital s with cedilla
1026	1	4µs			$chars{0xd054}=0x0162; # capital t with cedilla
1027	1	2µs			$chars{0xd063}=0x00e7; # small c with cedilla
1028	1	2µs			$chars{0xd064}=0x1e11; # small d with cedilla
1029	1	1µs			$chars{0xd065}=0x0119; # small e with cedilla
1030	1	1µs			$chars{0xd067}=0x0123; # small g with cedilla
1031	1	900ns			$chars{0xd068}=0x1e29; # small h with cedilla
1032	1	1µs			$chars{0xd06b}=0x0137; # small k with cedilla
1033	1	1µs			$chars{0xd06c}=0x013c; # small l with cedilla
1034	1	1µs			$chars{0xd06e}=0x0146; # small n with cedilla
1035	1	1µs			$chars{0xd072}=0x0157; # small r with cedilla
1036	1	6µs			$chars{0xd073}=0x015f; # small s with cedilla
1037	1	2µs			$chars{0xd074}=0x0163; # small t with cedilla
1038					# 5/1 rude
1039					# 5/2 hook to left
1040					# 5/3 ogonek (hook to right
1041	1	4µs			$chars{0xd320}=0x02db; # ogonek
1042	1	2µs			$chars{0xd341}=0x0104; # capital a with ogonek
1043	1	900ns			$chars{0xd345}=0x0118; # capital e with ogonek
1044	1	2µs			$chars{0xd349}=0x012e; # capital i with ogonek
1045	1	1µs			$chars{0xd34f}=0x01ea; # capital o with ogonek
1046	1	1µs			$chars{0xd355}=0x0172; # capital u with ogonek
1047	1	2µs			$chars{0xd361}=0x0105; # small a with ogonek
1048	1	1µs			$chars{0xd365}=0x0119; # small e with ogonek
1049	1	2µs			$chars{0xd369}=0x012f; # small i with ogonek
1050	1	1µs			$chars{0xd36f}=0x01eb; # small o with ogonek
1051	1	1µs			$chars{0xd375}=0x0173; # small u with ogonek
1052					# 5/4 circle below
1053	1	3µs			$chars{0xd441}=0x1e00; # capital a with ring below
1054	1	2µs			$chars{0xd461}=0x1e01; # small a with ring below
1055					# 5/5 half circle below
1056	1	1µs			$chars{0xf948}=0x1e2a; # capital h with breve below
1057	1	1µs			$chars{0xf968}=0x1e2b; # small h with breve below
1058					# 5/6 dot below
1059	1	2µs			$chars{0xd641}=0x1ea0; # capital a with dot below
1060	1	1µs			$chars{0xd642}=0x1e04; # capital b with dot below
1061	1	1µs			$chars{0xd644}=0x1e0c; # capital d with dot below
1062	1	1µs			$chars{0xd645}=0x1eb8; # capital e with dot below
1063	1	1µs			$chars{0xd648}=0x1e24; # capital h with dot below
1064	1	1µs			$chars{0xd649}=0x1eca; # capital i with dot below
1065	1	1µs			$chars{0xd64b}=0x1e32; # capital k with dot below
1066	1	2µs			$chars{0xd64c}=0x1e36; # capital l with dot below
1067	1	2µs			$chars{0xd64d}=0x1e42; # capital m with dot below
1068	1	2µs			$chars{0xd64e}=0x1e46; # capital n with dot below
1069	1	1µs			$chars{0xd64f}=0x1ecc; # capital o with dot below
1070	1	1µs			$chars{0xd652}=0x1e5a; # capital r with dot below
1071	1	2µs			$chars{0xd653}=0x1e62; # capital s with dot below
1072	1	3µs			$chars{0xd654}=0x1e6c; # capital t with dot below
1073	1	2µs			$chars{0xd655}=0x1ee4; # capital u with dot below
1074	1	2µs			$chars{0xd656}=0x1e7e; # capital v with dot below
1075	1	2µs			$chars{0xd657}=0x1e88; # capital w with dot below
1076	1	1µs			$chars{0xd659}=0x1ef4; # capital y with dot below
1077	1	1µs			$chars{0xd65a}=0x1e92; # capital z with dot below
1078	1	2µs			$chars{0xd661}=0x1ea1; # small a with dot below
1079	1	2µs			$chars{0xd662}=0x1e05; # small b with dot below
1080	1	2µs			$chars{0xd664}=0x1e0d; # small d with dot below
1081	1	2µs			$chars{0xd665}=0x1eb9; # small e with dot below
1082	1	1µs			$chars{0xd668}=0x1e25; # small h with dot below
1083	1	2µs			$chars{0xd669}=0x1ecb; # small i with dot below
1084	1	2µs			$chars{0xd66b}=0x1e33; # small k with dot below
1085	1	2µs			$chars{0xd66c}=0x1e37; # small l with dot below
1086	1	7µs			$chars{0xd66d}=0x1e43; # small m with dot below
1087	1	2µs			$chars{0xd66e}=0x1e47; # small n with dot below
1088	1	2µs			$chars{0xd66f}=0x1ecd; # small o with dot below
1089	1	2µs			$chars{0xd672}=0x1e5b; # small r with dot below
1090	1	1µs			$chars{0xd673}=0x1e63; # small s with dot below
1091	1	2µs			$chars{0xd674}=0x1e6d; # small t with dot below
1092	1	2µs			$chars{0xd675}=0x1ee5; # small u with dot below
1093	1	2µs			$chars{0xd676}=0x1e7f; # small v with dot below
1094	1	2µs			$chars{0xd677}=0x1e89; # small w with dot below
1095	1	2µs			$chars{0xd679}=0x1ef5; # small y with dot below
1096	1	1µs			$chars{0xd67a}=0x1e93; # small z with dot below
1097					# 5/7 double dot below
1098	1	2µs			$chars{0xd755}=0x1e72; # capital u with diaeresis below
1099	1	2µs			$chars{0xd775}=0x1e73; # small u with diaeresis below
1100					# 5/8 underline
1101	1	2µs			$chars{0xd820}=0x005f; # underline
1102					# 5/9 double underline
1103	1	2µs			$chars{0xd920}=0x2017; # double underline
1104					# 5/10 small low vertical bar
1105	1	1µs			$chars{0xda20}=0x02cc; #
1106					# 5/11 circumflex below
1107					# 5/12 (this position shall not be used)
1108					# 5/13 left half of ligature sign and of double tilde
1109					# 5/14 right half of ligature sign
1110					# 5/15 right half of double tilde
1111					# map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1112
1113					sub char_decode5426 {
1114					my ( $string) = @_;
1115					my $result;
1116
1117					my @data = unpack("C*", $string);
1118					my @characters;
1119					my $length=scalar(@data);
1120					for (my $i = 0; $i < scalar(@data); $i++) {
1121					my $char= $data[$i];
1122					if ($char >= 0x00 && $char <= 0x7F){
1123					#IsAscii
1124
1125					push @characters,$char unless ($char<0x02 \|\|$char== 0x0F);
1126					}elsif (($char >= 0xC0 && $char <= 0xDF)) {
1127					#Combined Char
1128					my $convchar ;
1129					if ($chars{$char*256+$data[$i+1]}) {
1130					$convchar= $chars{$char * 256 + $data[$i+1]};
1131					$i++;
1132					# printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1133					} elsif ($chars{$char}) {
1134					$convchar= $chars{$char};
1135					# printf "0xC char %x, converted %x\n",$char,$chars{$char};
1136					}else {
1137					$convchar=$char;
1138					}
1139					push @characters,$convchar;
1140					} else {
1141					my $convchar;
1142					if ($chars{$char}) {
1143					$convchar= $chars{$char};
1144					# printf "char %x, converted %x\n",$char,$chars{$char};
1145					}else {
1146					# printf "char %x $char\n",$char;
1147					$convchar=$char;
1148					}
1149					push @characters,$convchar;
1150					}
1151					}
1152					$result=pack "U*",@characters;
1153					# $result=~s/\x01//;
1154					# $result=~s/\x00//;
1155					$result=~s/\x0f//;
1156					$result=~s/\x1b.//;
1157					$result=~s/\x0e//;
1158					$result=~s/\x1b\x5b//;
1159					# map{printf "%x",$_} @characters;
1160					# printf "\n";
1161					return $result;
1162					}
1163
1164	1	196µs			1;
1165
1166
1167					=head1 AUTHOR
1168
- -

					# spent 9.47ms (9.22+246µs) within C4::Charset::CORE:subst which was called 25 times, avg 379µs/call: # 25 times (9.22ms+246µs) by C4::Charset::StripNonXmlChars at line 385, avg 379µs/call sub C4::Charset::CORE:subst; # opcode