| Filename | /usr/share/koha/lib/C4/Charset.pm |
| Statements | Executed 12360 statements in 65.9ms |
| Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
|---|---|---|---|---|---|
| 25 | 1 | 1 | 25.4ms | 114ms | C4::Charset::SetUTF8Flag |
| 1319 | 1 | 1 | 16.1ms | 27.1ms | C4::Charset::NormalizeString |
| 25 | 1 | 1 | 9.22ms | 9.47ms | C4::Charset::CORE:subst (opcode) |
| 25 | 1 | 1 | 582µs | 10.0ms | C4::Charset::StripNonXmlChars |
| 1 | 1 | 1 | 340µs | 1.03ms | C4::Charset::BEGIN@24 |
| 1 | 1 | 1 | 33µs | 33µs | C4::Charset::BEGIN@30 |
| 1 | 1 | 1 | 26µs | 105µs | C4::Charset::BEGIN@26 |
| 1 | 1 | 1 | 24µs | 200µs | C4::Charset::BEGIN@25 |
| 1 | 1 | 1 | 21µs | 29µs | C4::Charset::BEGIN@20 |
| 1 | 1 | 1 | 18µs | 185µs | C4::Charset::BEGIN@28 |
| 1 | 1 | 1 | 15µs | 59µs | C4::Charset::BEGIN@23 |
| 1 | 1 | 1 | 13µs | 38µs | C4::Charset::BEGIN@21 |
| 0 | 0 | 0 | 0s | 0s | C4::Charset::IsStringUTF8ish |
| 0 | 0 | 0 | 0s | 0s | C4::Charset::MarcToUTF8Record |
| 0 | 0 | 0 | 0s | 0s | C4::Charset::SetMarcUnicodeFlag |
| 0 | 0 | 0 | 0s | 0s | C4::Charset::__ANON__[:542] |
| 0 | 0 | 0 | 0s | 0s | C4::Charset::_default_marc21_charconv_to_utf8 |
| 0 | 0 | 0 | 0s | 0s | C4::Charset::_default_unimarc_charconv_to_utf8 |
| 0 | 0 | 0 | 0s | 0s | C4::Charset::_marc_iso5426_to_utf8 |
| 0 | 0 | 0 | 0s | 0s | C4::Charset::_marc_marc8_to_utf8 |
| 0 | 0 | 0 | 0s | 0s | C4::Charset::_marc_to_utf8_replacement_char |
| 0 | 0 | 0 | 0s | 0s | C4::Charset::_marc_to_utf8_via_text_iconv |
| 0 | 0 | 0 | 0s | 0s | C4::Charset::char_decode5426 |
| 0 | 0 | 0 | 0s | 0s | C4::Charset::nsb_clean |
| Line | State ments |
Time on line |
Calls | Time in subs |
Code |
|---|---|---|---|---|---|
| 1 | package C4::Charset; | ||||
| 2 | |||||
| 3 | # Copyright (C) 2008 LibLime | ||||
| 4 | # | ||||
| 5 | # This file is part of Koha. | ||||
| 6 | # | ||||
| 7 | # Koha is free software; you can redistribute it and/or modify it under the | ||||
| 8 | # terms of the GNU General Public License as published by the Free Software | ||||
| 9 | # Foundation; either version 2 of the License, or (at your option) any later | ||||
| 10 | # version. | ||||
| 11 | # | ||||
| 12 | # Koha is distributed in the hope that it will be useful, but WITHOUT ANY | ||||
| 13 | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR | ||||
| 14 | # A PARTICULAR PURPOSE. See the GNU General Public License for more details. | ||||
| 15 | # | ||||
| 16 | # You should have received a copy of the GNU General Public License along | ||||
| 17 | # with Koha; if not, write to the Free Software Foundation, Inc., | ||||
| 18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
| 19 | |||||
| 20 | 3 | 33µs | 2 | 36µs | # spent 29µs (21+7) within C4::Charset::BEGIN@20 which was called:
# once (21µs+7µs) by C4::Biblio::BEGIN@37 at line 20 # spent 29µs making 1 call to C4::Charset::BEGIN@20
# spent 7µs making 1 call to strict::import |
| 21 | 3 | 34µs | 2 | 63µs | # spent 38µs (13+25) within C4::Charset::BEGIN@21 which was called:
# once (13µs+25µs) by C4::Biblio::BEGIN@37 at line 21 # spent 38µs making 1 call to C4::Charset::BEGIN@21
# spent 25µs making 1 call to warnings::import |
| 22 | |||||
| 23 | 3 | 36µs | 2 | 103µs | # spent 59µs (15+44) within C4::Charset::BEGIN@23 which was called:
# once (15µs+44µs) by C4::Biblio::BEGIN@37 at line 23 # spent 59µs making 1 call to C4::Charset::BEGIN@23
# spent 44µs making 1 call to Exporter::import |
| 24 | 3 | 149µs | 2 | 1.06ms | # spent 1.03ms (340µs+693µs) within C4::Charset::BEGIN@24 which was called:
# once (340µs+693µs) by C4::Biblio::BEGIN@37 at line 24 # spent 1.03ms making 1 call to C4::Charset::BEGIN@24
# spent 27µs making 1 call to Exporter::import |
| 25 | 3 | 65µs | 2 | 376µs | # spent 200µs (24+176) within C4::Charset::BEGIN@25 which was called:
# once (24µs+176µs) by C4::Biblio::BEGIN@37 at line 25 # spent 200µs making 1 call to C4::Charset::BEGIN@25
# spent 176µs making 1 call to Exporter::import |
| 26 | 3 | 75µs | 2 | 184µs | # spent 105µs (26+79) within C4::Charset::BEGIN@26 which was called:
# once (26µs+79µs) by C4::Biblio::BEGIN@37 at line 26 # spent 105µs making 1 call to C4::Charset::BEGIN@26
# spent 79µs making 1 call to Exporter::import |
| 27 | |||||
| 28 | 3 | 117µs | 2 | 352µs | # spent 185µs (18+167) within C4::Charset::BEGIN@28 which was called:
# once (18µs+167µs) by C4::Biblio::BEGIN@37 at line 28 # spent 185µs making 1 call to C4::Charset::BEGIN@28
# spent 167µs making 1 call to vars::import |
| 29 | |||||
| 30 | # spent 33µs within C4::Charset::BEGIN@30 which was called:
# once (33µs+0s) by C4::Biblio::BEGIN@37 at line 44 | ||||
| 31 | # set the version for version checking | ||||
| 32 | 4 | 33µs | $VERSION = 3.07.00.049; | ||
| 33 | require Exporter; | ||||
| 34 | @ISA = qw(Exporter); | ||||
| 35 | @EXPORT = qw( | ||||
| 36 | NormalizeString | ||||
| 37 | IsStringUTF8ish | ||||
| 38 | MarcToUTF8Record | ||||
| 39 | SetUTF8Flag | ||||
| 40 | SetMarcUnicodeFlag | ||||
| 41 | StripNonXmlChars | ||||
| 42 | nsb_clean | ||||
| 43 | ); | ||||
| 44 | 1 | 6.06ms | 1 | 33µs | } # spent 33µs making 1 call to C4::Charset::BEGIN@30 |
| 45 | |||||
| 46 | =head1 NAME | ||||
| 47 | |||||
| - - | |||||
| 77 | =head1 FUNCTIONS | ||||
| 78 | |||||
| - - | |||||
| 107 | sub IsStringUTF8ish { | ||||
| 108 | my $str = shift; | ||||
| 109 | |||||
| 110 | return 1 if utf8::is_utf8($str); | ||||
| 111 | return utf8::decode($str); | ||||
| 112 | } | ||||
| 113 | |||||
| 114 | =head2 SetUTF8Flag | ||||
| 115 | |||||
| - - | |||||
| 133 | # spent 114ms (25.4+88.6) within C4::Charset::SetUTF8Flag which was called 25 times, avg 4.56ms/call:
# 25 times (25.4ms+88.6ms) by C4::Search::searchResults at line 2011 of /usr/share/koha/lib/C4/Search.pm, avg 4.56ms/call | ||||
| 134 | 3986 | 20.9ms | my ($record, $nfd)=@_; | ||
| 135 | 25 | 176µs | return unless ($record && $record->fields()); # spent 176µs making 25 calls to MARC::Record::fields, avg 7µs/call | ||
| 136 | 25 | 180µs | foreach my $field ($record->fields()){ # spent 180µs making 25 calls to MARC::Record::fields, avg 7µs/call | ||
| 137 | 544 | 2.54ms | if ($field->tag()>=10){ # spent 2.54ms making 544 calls to MARC::Field::tag, avg 5µs/call | ||
| 138 | my @subfields; | ||||
| 139 | 512 | 11.1ms | foreach my $subfield ($field->subfields()){ # spent 11.1ms making 512 calls to MARC::Field::subfields, avg 22µs/call | ||
| 140 | 1319 | 27.1ms | push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd)); # spent 27.1ms making 1319 calls to C4::Charset::NormalizeString, avg 21µs/call | ||
| 141 | } | ||||
| 142 | 2048 | 38.7ms | my $newfield=MARC::Field->new( # spent 26.4ms making 512 calls to MARC::Field::new, avg 52µs/call
# spent 10.0ms making 1024 calls to MARC::Field::indicator, avg 10µs/call
# spent 2.24ms making 512 calls to MARC::Field::tag, avg 4µs/call | ||
| 143 | $field->tag(), | ||||
| 144 | $field->indicator(1), | ||||
| 145 | $field->indicator(2), | ||||
| 146 | @subfields | ||||
| 147 | ); | ||||
| 148 | 512 | 8.87ms | $field->replace_with($newfield); # spent 8.87ms making 512 calls to MARC::Field::replace_with, avg 17µs/call | ||
| 149 | } | ||||
| 150 | } | ||||
| 151 | } | ||||
| 152 | |||||
| 153 | =head2 NormalizeString | ||||
| 154 | |||||
| - - | |||||
| 173 | # spent 27.1ms (16.1+11.0) within C4::Charset::NormalizeString which was called 1319 times, avg 21µs/call:
# 1319 times (16.1ms+11.0ms) by C4::Charset::SetUTF8Flag at line 140, avg 21µs/call | ||||
| 174 | 7914 | 27.8ms | my ($string,$nfd,$transform)=@_; | ||
| 175 | 2638 | 2.95ms | utf8::decode($string) unless (utf8::is_utf8($string)); # spent 1.70ms making 1319 calls to utf8::decode, avg 1µs/call
# spent 1.25ms making 1319 calls to utf8::is_utf8, avg 945ns/call | ||
| 176 | if ($nfd){ | ||||
| 177 | $string= NFD($string); | ||||
| 178 | } | ||||
| 179 | else { | ||||
| 180 | 1319 | 8.06ms | $string=NFC($string); # spent 8.06ms making 1319 calls to Unicode::Normalize::NFC, avg 6µs/call | ||
| 181 | } | ||||
| 182 | if ($transform){ | ||||
| 183 | $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g; | ||||
| 184 | #removing one letter words "d'" "l'" was changed into "d " "l " | ||||
| 185 | $string=~s/\b\S\b//g; | ||||
| 186 | $string=~s/\s+$//g; | ||||
| 187 | } | ||||
| 188 | return $string; | ||||
| 189 | } | ||||
| 190 | |||||
| 191 | =head2 MarcToUTF8Record | ||||
| 192 | |||||
| - - | |||||
| 212 | sub MarcToUTF8Record { | ||||
| 213 | my $marc = shift; | ||||
| 214 | my $marc_flavour = shift; | ||||
| 215 | my $source_encoding = shift; | ||||
| 216 | my $marc_record; | ||||
| 217 | my $marc_blob_is_utf8 = 0; | ||||
| 218 | if (ref($marc) eq 'MARC::Record') { | ||||
| 219 | my $marc_blob = $marc->as_usmarc(); | ||||
| 220 | $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob); | ||||
| 221 | $marc_record = $marc; | ||||
| 222 | } else { | ||||
| 223 | # dealing with a MARC blob | ||||
| 224 | |||||
| 225 | # remove any ersatz whitespace from the beginning and | ||||
| 226 | # end of the MARC blob -- these can creep into MARC | ||||
| 227 | # files produced by several sources -- caller really | ||||
| 228 | # should be doing this, however | ||||
| 229 | $marc =~ s/^\s+//; | ||||
| 230 | $marc =~ s/\s+$//; | ||||
| 231 | $marc_blob_is_utf8 = IsStringUTF8ish($marc); | ||||
| 232 | eval { | ||||
| 233 | $marc_record = MARC::Record->new_from_usmarc($marc); | ||||
| 234 | }; | ||||
| 235 | if ($@) { | ||||
| 236 | # if we fail the first time, one likely problem | ||||
| 237 | # is that we have a MARC21 record that says that it's | ||||
| 238 | # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters. | ||||
| 239 | # We'll try parsing it again. | ||||
| 240 | substr($marc, 9, 1) = ' '; | ||||
| 241 | eval { | ||||
| 242 | $marc_record = MARC::Record->new_from_usmarc($marc); | ||||
| 243 | }; | ||||
| 244 | if ($@) { | ||||
| 245 | # it's hopeless; return an empty MARC::Record | ||||
| 246 | return MARC::Record->new(), 'failed', ['could not parse MARC blob']; | ||||
| 247 | } | ||||
| 248 | } | ||||
| 249 | } | ||||
| 250 | |||||
| 251 | # If we do not know the source encoding, try some guesses | ||||
| 252 | # as follows: | ||||
| 253 | # 1. Record is UTF-8 already. | ||||
| 254 | # 2. If MARC flavor is MARC21 or NORMARC, then | ||||
| 255 | # a. record is MARC-8 | ||||
| 256 | # b. record is ISO-8859-1 | ||||
| 257 | # 3. If MARC flavor is UNIMARC, then | ||||
| 258 | if (not defined $source_encoding) { | ||||
| 259 | if ($marc_blob_is_utf8) { | ||||
| 260 | # note that for MARC21/NORMARC we are not bothering to check | ||||
| 261 | # if the Leader/09 is set to 'a' or not -- because | ||||
| 262 | # of problems with various ILSs (including Koha in the | ||||
| 263 | # past, alas), this just is not trustworthy. | ||||
| 264 | SetMarcUnicodeFlag($marc_record, $marc_flavour); | ||||
| 265 | return $marc_record, 'UTF-8', []; | ||||
| 266 | } else { | ||||
| 267 | if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') { | ||||
| 268 | return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour); | ||||
| 269 | } elsif ($marc_flavour =~/UNIMARC/) { | ||||
| 270 | return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour); | ||||
| 271 | } else { | ||||
| 272 | return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour); | ||||
| 273 | } | ||||
| 274 | } | ||||
| 275 | } else { | ||||
| 276 | # caller knows the character encoding | ||||
| 277 | my $original_marc_record = $marc_record->clone(); | ||||
| 278 | my @errors; | ||||
| 279 | if ($source_encoding =~ /utf-?8/i) { | ||||
| 280 | if ($marc_blob_is_utf8) { | ||||
| 281 | SetMarcUnicodeFlag($marc_record, $marc_flavour); | ||||
| 282 | return $marc_record, 'UTF-8', []; | ||||
| 283 | } else { | ||||
| 284 | push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8'; | ||||
| 285 | } | ||||
| 286 | } elsif ($source_encoding =~ /marc-?8/i) { | ||||
| 287 | @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour); | ||||
| 288 | } elsif ($source_encoding =~ /5426/) { | ||||
| 289 | @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour); | ||||
| 290 | } else { | ||||
| 291 | # assume any other character encoding is for Text::Iconv | ||||
| 292 | @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding); | ||||
| 293 | } | ||||
| 294 | |||||
| 295 | if (@errors) { | ||||
| 296 | _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour); | ||||
| 297 | return $original_marc_record, 'failed', \@errors; | ||||
| 298 | } else { | ||||
| 299 | return $marc_record, $source_encoding, []; | ||||
| 300 | } | ||||
| 301 | } | ||||
| 302 | |||||
| 303 | } | ||||
| 304 | |||||
| 305 | =head2 SetMarcUnicodeFlag | ||||
| 306 | |||||
| - - | |||||
| 317 | sub SetMarcUnicodeFlag { | ||||
| 318 | my $marc_record = shift; | ||||
| 319 | my $marc_flavour = shift; # || C4::Context->preference("marcflavour"); | ||||
| 320 | |||||
| 321 | $marc_record->encoding('UTF-8'); | ||||
| 322 | if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') { | ||||
| 323 | my $leader = $marc_record->leader(); | ||||
| 324 | substr($leader, 9, 1) = 'a'; | ||||
| 325 | $marc_record->leader($leader); | ||||
| 326 | } elsif ($marc_flavour =~/UNIMARC/) { | ||||
| 327 | my $defaultlanguage = C4::Context->preference("UNIMARCField100Language"); | ||||
| 328 | $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3); | ||||
| 329 | my $string; | ||||
| 330 | my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25)); | ||||
| 331 | $string=$marc_record->subfield( 100, "a" ); | ||||
| 332 | if (defined $string && length($string)==$subflength) { | ||||
| 333 | $string = substr $string, 0,$subflength if (length($string)>$subflength); | ||||
| 334 | } | ||||
| 335 | else { | ||||
| 336 | $string = POSIX::strftime( "%Y%m%d", localtime ); | ||||
| 337 | $string =~ s/\-//g; | ||||
| 338 | $string = sprintf( "%-*s", $subflength, $string ); | ||||
| 339 | substr ( $string, ($encodingposition - 3), 3, $defaultlanguage); | ||||
| 340 | } | ||||
| 341 | substr( $string, $encodingposition, 3, "y50" ); | ||||
| 342 | if ( $marc_record->subfield( 100, "a" ) ) { | ||||
| 343 | $marc_record->field('100')->update(a=>$string); | ||||
| 344 | } | ||||
| 345 | else { | ||||
| 346 | $marc_record->insert_grouped_field( | ||||
| 347 | MARC::Field->new( 100, '', '', "a" => $string ) ); | ||||
| 348 | } | ||||
| 349 | $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 ); | ||||
| 350 | } else { | ||||
| 351 | warn "Unrecognized marcflavour: $marc_flavour"; | ||||
| 352 | } | ||||
| 353 | } | ||||
| 354 | |||||
| 355 | =head2 StripNonXmlChars | ||||
| 356 | |||||
| - - | |||||
| 380 | # spent 10.0ms (582µs+9.47) within C4::Charset::StripNonXmlChars which was called 25 times, avg 402µs/call:
# 25 times (582µs+9.47ms) by C4::Biblio::GetMarcBiblio at line 1258 of /usr/share/koha/lib/C4/Biblio.pm, avg 402µs/call | ||||
| 381 | 100 | 8.96ms | my $str = shift; | ||
| 382 | if (!defined($str) || $str eq ""){ | ||||
| 383 | return ""; | ||||
| 384 | } | ||||
| 385 | 1 | 796µs | 26 | 9.71ms | $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g; # spent 9.47ms making 25 calls to C4::Charset::CORE:subst, avg 379µs/call
# spent 246µs making 1 call to utf8::SWASHNEW |
| 386 | return $str; | ||||
| 387 | } | ||||
| 388 | |||||
| - - | |||||
| 391 | =head2 nsb_clean | ||||
| 392 | |||||
| - - | |||||
| 402 | sub nsb_clean { | ||||
| 403 | my $NSB = '\x88' ; # NSB : begin Non Sorting Block | ||||
| 404 | my $NSE = '\x89' ; # NSE : Non Sorting Block end | ||||
| 405 | my $NSB2 = '\x98' ; # NSB : begin Non Sorting Block | ||||
| 406 | my $NSE2 = '\x9C' ; # NSE : Non Sorting Block end | ||||
| 407 | my $C2 = '\xC2' ; # What is this char ? It is sometimes left by the regexp after removing NSB / NSE | ||||
| 408 | |||||
| 409 | # handles non sorting blocks | ||||
| 410 | my ($string) = @_ ; | ||||
| 411 | $_ = $string ; | ||||
| 412 | s/$NSB//g ; | ||||
| 413 | s/$NSE//g ; | ||||
| 414 | s/$NSB2//g ; | ||||
| 415 | s/$NSE2//g ; | ||||
| 416 | s/$C2//g ; | ||||
| 417 | $string = $_ ; | ||||
| 418 | |||||
| 419 | return($string) ; | ||||
| 420 | } | ||||
| 421 | |||||
| 422 | |||||
| 423 | =head1 INTERNAL FUNCTIONS | ||||
| 424 | |||||
| - - | |||||
| 441 | sub _default_marc21_charconv_to_utf8 { | ||||
| 442 | my $marc_record = shift; | ||||
| 443 | my $marc_flavour = shift; | ||||
| 444 | |||||
| 445 | my $trial_marc8 = $marc_record->clone(); | ||||
| 446 | my @all_errors = (); | ||||
| 447 | my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour); | ||||
| 448 | unless (@errors) { | ||||
| 449 | return $trial_marc8, 'MARC-8', []; | ||||
| 450 | } | ||||
| 451 | push @all_errors, @errors; | ||||
| 452 | |||||
| 453 | my $trial_8859_1 = $marc_record->clone(); | ||||
| 454 | @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1'); | ||||
| 455 | unless (@errors) { | ||||
| 456 | return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors | ||||
| 457 | # instead if we wanted to report details | ||||
| 458 | # of the failed attempt at MARC-8 => UTF-8 | ||||
| 459 | } | ||||
| 460 | push @all_errors, @errors; | ||||
| 461 | |||||
| 462 | my $default_converted = $marc_record->clone(); | ||||
| 463 | _marc_to_utf8_replacement_char($default_converted, $marc_flavour); | ||||
| 464 | return $default_converted, 'failed', \@all_errors; | ||||
| 465 | } | ||||
| 466 | |||||
| 467 | =head2 _default_unimarc_charconv_to_utf8 | ||||
| 468 | |||||
| - - | |||||
| 483 | sub _default_unimarc_charconv_to_utf8 { | ||||
| 484 | my $marc_record = shift; | ||||
| 485 | my $marc_flavour = shift; | ||||
| 486 | |||||
| 487 | my $trial_marc8 = $marc_record->clone(); | ||||
| 488 | my @all_errors = (); | ||||
| 489 | my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour); | ||||
| 490 | unless (@errors) { | ||||
| 491 | return $trial_marc8, 'iso-5426'; | ||||
| 492 | } | ||||
| 493 | push @all_errors, @errors; | ||||
| 494 | |||||
| 495 | my $trial_8859_1 = $marc_record->clone(); | ||||
| 496 | @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1'); | ||||
| 497 | unless (@errors) { | ||||
| 498 | return $trial_8859_1, 'iso-8859-1'; | ||||
| 499 | } | ||||
| 500 | push @all_errors, @errors; | ||||
| 501 | |||||
| 502 | my $default_converted = $marc_record->clone(); | ||||
| 503 | _marc_to_utf8_replacement_char($default_converted, $marc_flavour); | ||||
| 504 | return $default_converted, 'failed', \@all_errors; | ||||
| 505 | } | ||||
| 506 | |||||
| 507 | =head2 _marc_marc8_to_utf8 | ||||
| 508 | |||||
| - - | |||||
| 518 | sub _marc_marc8_to_utf8 { | ||||
| 519 | my $marc_record = shift; | ||||
| 520 | my $marc_flavour = shift; | ||||
| 521 | |||||
| 522 | my $prev_ignore = MARC::Charset->ignore_errors(); | ||||
| 523 | MARC::Charset->ignore_errors(1); | ||||
| 524 | |||||
| 525 | # trap warnings raised by MARC::Charset | ||||
| 526 | my @errors = (); | ||||
| 527 | local $SIG{__WARN__} = sub { | ||||
| 528 | my $msg = $_[0]; | ||||
| 529 | if ($msg =~ /MARC.Charset/) { | ||||
| 530 | # FIXME - purpose of this regexp is to strip out the | ||||
| 531 | # line reference to MARC/Charset.pm, but as it | ||||
| 532 | # exists probably won't work quite on Windows -- | ||||
| 533 | # some sort of minimal-bunch back-tracking RE | ||||
| 534 | # would be helpful here | ||||
| 535 | $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//; | ||||
| 536 | push @errors, $msg; | ||||
| 537 | } else { | ||||
| 538 | # if warning doesn't come from MARC::Charset, just | ||||
| 539 | # pass it on | ||||
| 540 | warn $msg; | ||||
| 541 | } | ||||
| 542 | }; | ||||
| 543 | |||||
| 544 | foreach my $field ($marc_record->fields()) { | ||||
| 545 | if ($field->is_control_field()) { | ||||
| 546 | ; # do nothing -- control fields should not contain non-ASCII characters | ||||
| 547 | } else { | ||||
| 548 | my @converted_subfields; | ||||
| 549 | foreach my $subfield ($field->subfields()) { | ||||
| 550 | my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]); | ||||
| 551 | unless (IsStringUTF8ish($utf8sf)) { | ||||
| 552 | # Because of a bug in MARC::Charset 0.98, if the string | ||||
| 553 | # has (a) one or more diacritics that (b) are only in character positions | ||||
| 554 | # 128 to 255 inclusive, the resulting converted string is not in | ||||
| 555 | # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that | ||||
| 556 | # occurs, upgrade the string in place. Moral of the story seems to be | ||||
| 557 | # that pack("U", ...) is better than chr(...) if you need to guarantee | ||||
| 558 | # that the resulting string is UTF-8. | ||||
| 559 | utf8::upgrade($utf8sf); | ||||
| 560 | } | ||||
| 561 | push @converted_subfields, $subfield->[0], $utf8sf; | ||||
| 562 | } | ||||
| 563 | |||||
| 564 | $field->replace_with(MARC::Field->new( | ||||
| 565 | $field->tag(), $field->indicator(1), $field->indicator(2), | ||||
| 566 | @converted_subfields) | ||||
| 567 | ); | ||||
| 568 | } | ||||
| 569 | } | ||||
| 570 | |||||
| 571 | MARC::Charset->ignore_errors($prev_ignore); | ||||
| 572 | |||||
| 573 | SetMarcUnicodeFlag($marc_record, $marc_flavour); | ||||
| 574 | |||||
| 575 | return @errors; | ||||
| 576 | } | ||||
| 577 | |||||
| 578 | =head2 _marc_iso5426_to_utf8 | ||||
| 579 | |||||
| - - | |||||
| 592 | sub _marc_iso5426_to_utf8 { | ||||
| 593 | my $marc_record = shift; | ||||
| 594 | my $marc_flavour = shift; | ||||
| 595 | |||||
| 596 | my @errors = (); | ||||
| 597 | |||||
| 598 | foreach my $field ($marc_record->fields()) { | ||||
| 599 | if ($field->is_control_field()) { | ||||
| 600 | ; # do nothing -- control fields should not contain non-ASCII characters | ||||
| 601 | } else { | ||||
| 602 | my @converted_subfields; | ||||
| 603 | foreach my $subfield ($field->subfields()) { | ||||
| 604 | my $utf8sf = char_decode5426($subfield->[1]); | ||||
| 605 | push @converted_subfields, $subfield->[0], $utf8sf; | ||||
| 606 | } | ||||
| 607 | |||||
| 608 | $field->replace_with(MARC::Field->new( | ||||
| 609 | $field->tag(), $field->indicator(1), $field->indicator(2), | ||||
| 610 | @converted_subfields) | ||||
| 611 | ); | ||||
| 612 | } | ||||
| 613 | } | ||||
| 614 | |||||
| 615 | SetMarcUnicodeFlag($marc_record, $marc_flavour); | ||||
| 616 | |||||
| 617 | return @errors; | ||||
| 618 | } | ||||
| 619 | |||||
| 620 | =head2 _marc_to_utf8_via_text_iconv | ||||
| 621 | |||||
| - - | |||||
| 634 | sub _marc_to_utf8_via_text_iconv { | ||||
| 635 | my $marc_record = shift; | ||||
| 636 | my $marc_flavour = shift; | ||||
| 637 | my $source_encoding = shift; | ||||
| 638 | |||||
| 639 | my @errors = (); | ||||
| 640 | my $decoder; | ||||
| 641 | eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); }; | ||||
| 642 | if ($@) { | ||||
| 643 | push @errors, "Could not initialze $source_encoding => utf8 converter: $@"; | ||||
| 644 | return @errors; | ||||
| 645 | } | ||||
| 646 | |||||
| 647 | my $prev_raise_error = Text::Iconv->raise_error(); | ||||
| 648 | Text::Iconv->raise_error(1); | ||||
| 649 | |||||
| 650 | foreach my $field ($marc_record->fields()) { | ||||
| 651 | if ($field->is_control_field()) { | ||||
| 652 | ; # do nothing -- control fields should not contain non-ASCII characters | ||||
| 653 | } else { | ||||
| 654 | my @converted_subfields; | ||||
| 655 | foreach my $subfield ($field->subfields()) { | ||||
| 656 | my $converted_value; | ||||
| 657 | my $conversion_ok = 1; | ||||
| 658 | eval { $converted_value = $decoder->convert($subfield->[1]); }; | ||||
| 659 | if ($@) { | ||||
| 660 | $conversion_ok = 0; | ||||
| 661 | push @errors, $@; | ||||
| 662 | } elsif (not defined $converted_value) { | ||||
| 663 | $conversion_ok = 0; | ||||
| 664 | push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval(); | ||||
| 665 | } | ||||
| 666 | |||||
| 667 | if ($conversion_ok) { | ||||
| 668 | push @converted_subfields, $subfield->[0], $converted_value; | ||||
| 669 | } else { | ||||
| 670 | $converted_value = $subfield->[1]; | ||||
| 671 | $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g; | ||||
| 672 | push @converted_subfields, $subfield->[0], $converted_value; | ||||
| 673 | } | ||||
| 674 | } | ||||
| 675 | |||||
| 676 | $field->replace_with(MARC::Field->new( | ||||
| 677 | $field->tag(), $field->indicator(1), $field->indicator(2), | ||||
| 678 | @converted_subfields) | ||||
| 679 | ); | ||||
| 680 | } | ||||
| 681 | } | ||||
| 682 | |||||
| 683 | SetMarcUnicodeFlag($marc_record, $marc_flavour); | ||||
| 684 | Text::Iconv->raise_error($prev_raise_error); | ||||
| 685 | |||||
| 686 | return @errors; | ||||
| 687 | } | ||||
| 688 | |||||
| 689 | =head2 _marc_to_utf8_replacement_char | ||||
| 690 | |||||
| - - | |||||
| 703 | sub _marc_to_utf8_replacement_char { | ||||
| 704 | my $marc_record = shift; | ||||
| 705 | my $marc_flavour = shift; | ||||
| 706 | |||||
| 707 | foreach my $field ($marc_record->fields()) { | ||||
| 708 | if ($field->is_control_field()) { | ||||
| 709 | ; # do nothing -- control fields should not contain non-ASCII characters | ||||
| 710 | } else { | ||||
| 711 | my @converted_subfields; | ||||
| 712 | foreach my $subfield ($field->subfields()) { | ||||
| 713 | my $value = $subfield->[1]; | ||||
| 714 | $value =~ s/[\200-\377]/\xef\xbf\xbd/g; | ||||
| 715 | push @converted_subfields, $subfield->[0], $value; | ||||
| 716 | } | ||||
| 717 | |||||
| 718 | $field->replace_with(MARC::Field->new( | ||||
| 719 | $field->tag(), $field->indicator(1), $field->indicator(2), | ||||
| 720 | @converted_subfields) | ||||
| 721 | ); | ||||
| 722 | } | ||||
| 723 | } | ||||
| 724 | |||||
| 725 | SetMarcUnicodeFlag($marc_record, $marc_flavour); | ||||
| 726 | } | ||||
| 727 | |||||
| 728 | =head2 char_decode5426 | ||||
| 729 | |||||
| - - | |||||
| 737 | 1 | 900ns | my %chars; | ||
| 738 | 1 | 8µs | $chars{0xb0}=0x0101;#3/0ayn[ain] | ||
| 739 | 1 | 3µs | $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove] | ||
| 740 | #$chars{0xb2}=0x00e0;#'à'; | ||||
| 741 | 1 | 2µs | $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark | ||
| 742 | #$chars{0xb3}=0x00e7;#'ç'; | ||||
| 743 | 1 | 2µs | $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark | ||
| 744 | # $chars{0xb4}='è'; | ||||
| 745 | 1 | 1µs | $chars{0xb4}=0x00e8; | ||
| 746 | 1 | 1µs | $chars{0xbd}=0x02b9; | ||
| 747 | 1 | 2µs | $chars{0xbe}=0x02ba; | ||
| 748 | # $chars{0xb5}='é'; | ||||
| 749 | 1 | 2µs | $chars{0xb5}=0x00e9; | ||
| 750 | 1 | 3µs | $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark | ||
| 751 | 1 | 1µs | $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark | ||
| 752 | 1 | 1µs | $chars{0xfa}=0x0153; #oe | ||
| 753 | 1 | 3µs | $chars{0xea}=0x0152; #oe | ||
| 754 | 1 | 2µs | $chars{0x81d1}=0x00b0; | ||
| 755 | |||||
| 756 | #### | ||||
| 757 | ## combined characters iso5426 | ||||
| 758 | |||||
| 759 | 1 | 1µs | $chars{0xc041}=0x1ea2; # capital a with hook above | ||
| 760 | 1 | 1µs | $chars{0xc045}=0x1eba; # capital e with hook above | ||
| 761 | 1 | 1µs | $chars{0xc049}=0x1ec8; # capital i with hook above | ||
| 762 | 1 | 2µs | $chars{0xc04f}=0x1ece; # capital o with hook above | ||
| 763 | 1 | 2µs | $chars{0xc055}=0x1ee6; # capital u with hook above | ||
| 764 | 1 | 2µs | $chars{0xc059}=0x1ef6; # capital y with hook above | ||
| 765 | 1 | 1µs | $chars{0xc061}=0x1ea3; # small a with hook above | ||
| 766 | 1 | 2µs | $chars{0xc065}=0x1ebb; # small e with hook above | ||
| 767 | 1 | 1µs | $chars{0xc069}=0x1ec9; # small i with hook above | ||
| 768 | 1 | 2µs | $chars{0xc06f}=0x1ecf; # small o with hook above | ||
| 769 | 1 | 2µs | $chars{0xc075}=0x1ee7; # small u with hook above | ||
| 770 | 1 | 2µs | $chars{0xc079}=0x1ef7; # small y with hook above | ||
| 771 | |||||
| 772 | # 4/1 grave accent | ||||
| 773 | 1 | 2µs | $chars{0xc141}=0x00c0; # capital a with grave accent | ||
| 774 | 1 | 9µs | $chars{0xc145}=0x00c8; # capital e with grave accent | ||
| 775 | 1 | 3µs | $chars{0xc149}=0x00cc; # capital i with grave accent | ||
| 776 | 1 | 1µs | $chars{0xc14f}=0x00d2; # capital o with grave accent | ||
| 777 | 1 | 2µs | $chars{0xc155}=0x00d9; # capital u with grave accent | ||
| 778 | 1 | 2µs | $chars{0xc157}=0x1e80; # capital w with grave | ||
| 779 | 1 | 3µs | $chars{0xc159}=0x1ef2; # capital y with grave | ||
| 780 | 1 | 2µs | $chars{0xc161}=0x00e0; # small a with grave accent | ||
| 781 | 1 | 2µs | $chars{0xc165}=0x00e8; # small e with grave accent | ||
| 782 | 1 | 2µs | $chars{0xc169}=0x00ec; # small i with grave accent | ||
| 783 | 1 | 2µs | $chars{0xc16f}=0x00f2; # small o with grave accent | ||
| 784 | 1 | 2µs | $chars{0xc175}=0x00f9; # small u with grave accent | ||
| 785 | 1 | 2µs | $chars{0xc177}=0x1e81; # small w with grave | ||
| 786 | 1 | 2µs | $chars{0xc179}=0x1ef3; # small y with grave | ||
| 787 | # 4/2 acute accent | ||||
| 788 | 1 | 2µs | $chars{0xc241}=0x00c1; # capital a with acute accent | ||
| 789 | 1 | 2µs | $chars{0xc243}=0x0106; # capital c with acute accent | ||
| 790 | 1 | 1µs | $chars{0xc245}=0x00c9; # capital e with acute accent | ||
| 791 | 1 | 2µs | $chars{0xc247}=0x01f4; # capital g with acute | ||
| 792 | 1 | 2µs | $chars{0xc249}=0x00cd; # capital i with acute accent | ||
| 793 | 1 | 2µs | $chars{0xc24b}=0x1e30; # capital k with acute | ||
| 794 | 1 | 3µs | $chars{0xc24c}=0x0139; # capital l with acute accent | ||
| 795 | 1 | 1µs | $chars{0xc24d}=0x1e3e; # capital m with acute | ||
| 796 | 1 | 3µs | $chars{0xc24e}=0x0143; # capital n with acute accent | ||
| 797 | 1 | 1µs | $chars{0xc24f}=0x00d3; # capital o with acute accent | ||
| 798 | 1 | 2µs | $chars{0xc250}=0x1e54; # capital p with acute | ||
| 799 | 1 | 2µs | $chars{0xc252}=0x0154; # capital r with acute accent | ||
| 800 | 1 | 1µs | $chars{0xc253}=0x015a; # capital s with acute accent | ||
| 801 | 1 | 1µs | $chars{0xc255}=0x00da; # capital u with acute accent | ||
| 802 | 1 | 1µs | $chars{0xc257}=0x1e82; # capital w with acute | ||
| 803 | 1 | 1µs | $chars{0xc259}=0x00dd; # capital y with acute accent | ||
| 804 | 1 | 1µs | $chars{0xc25a}=0x0179; # capital z with acute accent | ||
| 805 | 1 | 2µs | $chars{0xc261}=0x00e1; # small a with acute accent | ||
| 806 | 1 | 1µs | $chars{0xc263}=0x0107; # small c with acute accent | ||
| 807 | 1 | 1µs | $chars{0xc265}=0x00e9; # small e with acute accent | ||
| 808 | 1 | 1µs | $chars{0xc267}=0x01f5; # small g with acute | ||
| 809 | 1 | 2µs | $chars{0xc269}=0x00ed; # small i with acute accent | ||
| 810 | 1 | 2µs | $chars{0xc26b}=0x1e31; # small k with acute | ||
| 811 | 1 | 2µs | $chars{0xc26c}=0x013a; # small l with acute accent | ||
| 812 | 1 | 9µs | $chars{0xc26d}=0x1e3f; # small m with acute | ||
| 813 | 1 | 2µs | $chars{0xc26e}=0x0144; # small n with acute accent | ||
| 814 | 1 | 2µs | $chars{0xc26f}=0x00f3; # small o with acute accent | ||
| 815 | 1 | 1µs | $chars{0xc270}=0x1e55; # small p with acute | ||
| 816 | 1 | 1µs | $chars{0xc272}=0x0155; # small r with acute accent | ||
| 817 | 1 | 1µs | $chars{0xc273}=0x015b; # small s with acute accent | ||
| 818 | 1 | 2µs | $chars{0xc275}=0x00fa; # small u with acute accent | ||
| 819 | 1 | 1µs | $chars{0xc277}=0x1e83; # small w with acute | ||
| 820 | 1 | 1µs | $chars{0xc279}=0x00fd; # small y with acute accent | ||
| 821 | 1 | 1µs | $chars{0xc27a}=0x017a; # small z with acute accent | ||
| 822 | 1 | 2µs | $chars{0xc2e1}=0x01fc; # capital ae with acute | ||
| 823 | 1 | 1µs | $chars{0xc2f1}=0x01fd; # small ae with acute | ||
| 824 | # 4/3 circumflex accent | ||||
| 825 | 1 | 2µs | $chars{0xc341}=0x00c2; # capital a with circumflex accent | ||
| 826 | 1 | 1µs | $chars{0xc343}=0x0108; # capital c with circumflex | ||
| 827 | 1 | 2µs | $chars{0xc345}=0x00ca; # capital e with circumflex accent | ||
| 828 | 1 | 2µs | $chars{0xc347}=0x011c; # capital g with circumflex | ||
| 829 | 1 | 1µs | $chars{0xc348}=0x0124; # capital h with circumflex | ||
| 830 | 1 | 1µs | $chars{0xc349}=0x00ce; # capital i with circumflex accent | ||
| 831 | 1 | 2µs | $chars{0xc34a}=0x0134; # capital j with circumflex | ||
| 832 | 1 | 1µs | $chars{0xc34f}=0x00d4; # capital o with circumflex accent | ||
| 833 | 1 | 2µs | $chars{0xc353}=0x015c; # capital s with circumflex | ||
| 834 | 1 | 1µs | $chars{0xc355}=0x00db; # capital u with circumflex | ||
| 835 | 1 | 2µs | $chars{0xc357}=0x0174; # capital w with circumflex | ||
| 836 | 1 | 2µs | $chars{0xc359}=0x0176; # capital y with circumflex | ||
| 837 | 1 | 1µs | $chars{0xc35a}=0x1e90; # capital z with circumflex | ||
| 838 | 1 | 1µs | $chars{0xc361}=0x00e2; # small a with circumflex accent | ||
| 839 | 1 | 2µs | $chars{0xc363}=0x0109; # small c with circumflex | ||
| 840 | 1 | 2µs | $chars{0xc365}=0x00ea; # small e with circumflex accent | ||
| 841 | 1 | 1µs | $chars{0xc367}=0x011d; # small g with circumflex | ||
| 842 | 1 | 2µs | $chars{0xc368}=0x0125; # small h with circumflex | ||
| 843 | 1 | 2µs | $chars{0xc369}=0x00ee; # small i with circumflex accent | ||
| 844 | 1 | 2µs | $chars{0xc36a}=0x0135; # small j with circumflex | ||
| 845 | 1 | 2µs | $chars{0xc36e}=0x00f1; # small n with tilde | ||
| 846 | 1 | 2µs | $chars{0xc36f}=0x00f4; # small o with circumflex accent | ||
| 847 | 1 | 2µs | $chars{0xc373}=0x015d; # small s with circumflex | ||
| 848 | 1 | 2µs | $chars{0xc375}=0x00fb; # small u with circumflex | ||
| 849 | 1 | 2µs | $chars{0xc377}=0x0175; # small w with circumflex | ||
| 850 | 1 | 2µs | $chars{0xc379}=0x0177; # small y with circumflex | ||
| 851 | 1 | 1µs | $chars{0xc37a}=0x1e91; # small z with circumflex | ||
| 852 | # 4/4 tilde | ||||
| 853 | 1 | 2µs | $chars{0xc441}=0x00c3; # capital a with tilde | ||
| 854 | 1 | 1µs | $chars{0xc445}=0x1ebc; # capital e with tilde | ||
| 855 | 1 | 1µs | $chars{0xc449}=0x0128; # capital i with tilde | ||
| 856 | 1 | 1µs | $chars{0xc44e}=0x00d1; # capital n with tilde | ||
| 857 | 1 | 1µs | $chars{0xc44f}=0x00d5; # capital o with tilde | ||
| 858 | 1 | 2µs | $chars{0xc455}=0x0168; # capital u with tilde | ||
| 859 | 1 | 1µs | $chars{0xc456}=0x1e7c; # capital v with tilde | ||
| 860 | 1 | 2µs | $chars{0xc459}=0x1ef8; # capital y with tilde | ||
| 861 | 1 | 2µs | $chars{0xc461}=0x00e3; # small a with tilde | ||
| 862 | 1 | 2µs | $chars{0xc465}=0x1ebd; # small e with tilde | ||
| 863 | 1 | 8µs | $chars{0xc469}=0x0129; # small i with tilde | ||
| 864 | 1 | 2µs | $chars{0xc46e}=0x00f1; # small n with tilde | ||
| 865 | 1 | 1µs | $chars{0xc46f}=0x00f5; # small o with tilde | ||
| 866 | 1 | 1µs | $chars{0xc475}=0x0169; # small u with tilde | ||
| 867 | 1 | 2µs | $chars{0xc476}=0x1e7d; # small v with tilde | ||
| 868 | 1 | 1µs | $chars{0xc479}=0x1ef9; # small y with tilde | ||
| 869 | # 4/5 macron | ||||
| 870 | 1 | 2µs | $chars{0xc541}=0x0100; # capital a with macron | ||
| 871 | 1 | 2µs | $chars{0xc545}=0x0112; # capital e with macron | ||
| 872 | 1 | 2µs | $chars{0xc547}=0x1e20; # capital g with macron | ||
| 873 | 1 | 1µs | $chars{0xc549}=0x012a; # capital i with macron | ||
| 874 | 1 | 5µs | $chars{0xc54f}=0x014c; # capital o with macron | ||
| 875 | 1 | 2µs | $chars{0xc555}=0x016a; # capital u with macron | ||
| 876 | 1 | 2µs | $chars{0xc561}=0x0101; # small a with macron | ||
| 877 | 1 | 2µs | $chars{0xc565}=0x0113; # small e with macron | ||
| 878 | 1 | 2µs | $chars{0xc567}=0x1e21; # small g with macron | ||
| 879 | 1 | 2µs | $chars{0xc569}=0x012b; # small i with macron | ||
| 880 | 1 | 6µs | $chars{0xc56f}=0x014d; # small o with macron | ||
| 881 | 1 | 2µs | $chars{0xc575}=0x016b; # small u with macron | ||
| 882 | 1 | 2µs | $chars{0xc572}=0x0159; # small r with macron | ||
| 883 | 1 | 2µs | $chars{0xc5e1}=0x01e2; # capital ae with macron | ||
| 884 | 1 | 1µs | $chars{0xc5f1}=0x01e3; # small ae with macron | ||
| 885 | # 4/6 breve | ||||
| 886 | 1 | 2µs | $chars{0xc641}=0x0102; # capital a with breve | ||
| 887 | 1 | 2µs | $chars{0xc645}=0x0114; # capital e with breve | ||
| 888 | 1 | 2µs | $chars{0xc647}=0x011e; # capital g with breve | ||
| 889 | 1 | 2µs | $chars{0xc649}=0x012c; # capital i with breve | ||
| 890 | 1 | 1µs | $chars{0xc64f}=0x014e; # capital o with breve | ||
| 891 | 1 | 2µs | $chars{0xc655}=0x016c; # capital u with breve | ||
| 892 | 1 | 1µs | $chars{0xc661}=0x0103; # small a with breve | ||
| 893 | 1 | 2µs | $chars{0xc665}=0x0115; # small e with breve | ||
| 894 | 1 | 1µs | $chars{0xc667}=0x011f; # small g with breve | ||
| 895 | 1 | 2µs | $chars{0xc669}=0x012d; # small i with breve | ||
| 896 | 1 | 3µs | $chars{0xc66f}=0x014f; # small o with breve | ||
| 897 | 1 | 2µs | $chars{0xc675}=0x016d; # small u with breve | ||
| 898 | # 4/7 dot above | ||||
| 899 | 1 | 2µs | $chars{0xc7b0}=0x01e1; # Ain with dot above | ||
| 900 | 1 | 2µs | $chars{0xc742}=0x1e02; # capital b with dot above | ||
| 901 | 1 | 2µs | $chars{0xc743}=0x010a; # capital c with dot above | ||
| 902 | 1 | 2µs | $chars{0xc744}=0x1e0a; # capital d with dot above | ||
| 903 | 1 | 1µs | $chars{0xc745}=0x0116; # capital e with dot above | ||
| 904 | 1 | 1µs | $chars{0xc746}=0x1e1e; # capital f with dot above | ||
| 905 | 1 | 1µs | $chars{0xc747}=0x0120; # capital g with dot above | ||
| 906 | 1 | 2µs | $chars{0xc748}=0x1e22; # capital h with dot above | ||
| 907 | 1 | 2µs | $chars{0xc749}=0x0130; # capital i with dot above | ||
| 908 | 1 | 1µs | $chars{0xc74d}=0x1e40; # capital m with dot above | ||
| 909 | 1 | 1µs | $chars{0xc74e}=0x1e44; # capital n with dot above | ||
| 910 | 1 | 1µs | $chars{0xc750}=0x1e56; # capital p with dot above | ||
| 911 | 1 | 2µs | $chars{0xc752}=0x1e58; # capital r with dot above | ||
| 912 | 1 | 2µs | $chars{0xc753}=0x1e60; # capital s with dot above | ||
| 913 | 1 | 1µs | $chars{0xc754}=0x1e6a; # capital t with dot above | ||
| 914 | 1 | 2µs | $chars{0xc757}=0x1e86; # capital w with dot above | ||
| 915 | 1 | 6µs | $chars{0xc758}=0x1e8a; # capital x with dot above | ||
| 916 | 1 | 2µs | $chars{0xc759}=0x1e8e; # capital y with dot above | ||
| 917 | 1 | 1µs | $chars{0xc75a}=0x017b; # capital z with dot above | ||
| 918 | 1 | 2µs | $chars{0xc761}=0x0227; # small b with dot above | ||
| 919 | 1 | 1µs | $chars{0xc762}=0x1e03; # small b with dot above | ||
| 920 | 1 | 1µs | $chars{0xc763}=0x010b; # small c with dot above | ||
| 921 | 1 | 1µs | $chars{0xc764}=0x1e0b; # small d with dot above | ||
| 922 | 1 | 6µs | $chars{0xc765}=0x0117; # small e with dot above | ||
| 923 | 1 | 1µs | $chars{0xc766}=0x1e1f; # small f with dot above | ||
| 924 | 1 | 1µs | $chars{0xc767}=0x0121; # small g with dot above | ||
| 925 | 1 | 1µs | $chars{0xc768}=0x1e23; # small h with dot above | ||
| 926 | 1 | 2µs | $chars{0xc76d}=0x1e41; # small m with dot above | ||
| 927 | 1 | 1µs | $chars{0xc76e}=0x1e45; # small n with dot above | ||
| 928 | 1 | 1µs | $chars{0xc770}=0x1e57; # small p with dot above | ||
| 929 | 1 | 2µs | $chars{0xc772}=0x1e59; # small r with dot above | ||
| 930 | 1 | 2µs | $chars{0xc773}=0x1e61; # small s with dot above | ||
| 931 | 1 | 1µs | $chars{0xc774}=0x1e6b; # small t with dot above | ||
| 932 | 1 | 1µs | $chars{0xc777}=0x1e87; # small w with dot above | ||
| 933 | 1 | 1µs | $chars{0xc778}=0x1e8b; # small x with dot above | ||
| 934 | 1 | 2µs | $chars{0xc779}=0x1e8f; # small y with dot above | ||
| 935 | 1 | 1µs | $chars{0xc77a}=0x017c; # small z with dot above | ||
| 936 | # 4/8 trema, diaresis | ||||
| 937 | 1 | 2µs | $chars{0xc820}=0x00a8; # diaeresis | ||
| 938 | 1 | 2µs | $chars{0xc841}=0x00c4; # capital a with diaeresis | ||
| 939 | 1 | 2µs | $chars{0xc845}=0x00cb; # capital e with diaeresis | ||
| 940 | 1 | 2µs | $chars{0xc848}=0x1e26; # capital h with diaeresis | ||
| 941 | 1 | 2µs | $chars{0xc849}=0x00cf; # capital i with diaeresis | ||
| 942 | 1 | 3µs | $chars{0xc84f}=0x00d6; # capital o with diaeresis | ||
| 943 | 1 | 2µs | $chars{0xc855}=0x00dc; # capital u with diaeresis | ||
| 944 | 1 | 2µs | $chars{0xc857}=0x1e84; # capital w with diaeresis | ||
| 945 | 1 | 1µs | $chars{0xc858}=0x1e8c; # capital x with diaeresis | ||
| 946 | 1 | 2µs | $chars{0xc859}=0x0178; # capital y with diaeresis | ||
| 947 | 1 | 2µs | $chars{0xc861}=0x00e4; # small a with diaeresis | ||
| 948 | 1 | 1µs | $chars{0xc865}=0x00eb; # small e with diaeresis | ||
| 949 | 1 | 1µs | $chars{0xc868}=0x1e27; # small h with diaeresis | ||
| 950 | 1 | 2µs | $chars{0xc869}=0x00ef; # small i with diaeresis | ||
| 951 | 1 | 8µs | $chars{0xc86f}=0x00f6; # small o with diaeresis | ||
| 952 | 1 | 2µs | $chars{0xc874}=0x1e97; # small t with diaeresis | ||
| 953 | 1 | 1µs | $chars{0xc875}=0x00fc; # small u with diaeresis | ||
| 954 | 1 | 2µs | $chars{0xc877}=0x1e85; # small w with diaeresis | ||
| 955 | 1 | 1µs | $chars{0xc878}=0x1e8d; # small x with diaeresis | ||
| 956 | 1 | 1µs | $chars{0xc879}=0x00ff; # small y with diaeresis | ||
| 957 | # 4/9 umlaut | ||||
| 958 | 1 | 1µs | $chars{0xc920}=0x00a8; # [diaeresis] | ||
| 959 | 1 | 1µs | $chars{0xc961}=0x00e4; # a with umlaut | ||
| 960 | 1 | 1µs | $chars{0xc965}=0x00eb; # e with umlaut | ||
| 961 | 1 | 1µs | $chars{0xc969}=0x00ef; # i with umlaut | ||
| 962 | 1 | 1µs | $chars{0xc96f}=0x00f6; # o with umlaut | ||
| 963 | 1 | 1µs | $chars{0xc975}=0x00fc; # u with umlaut | ||
| 964 | # 4/10 circle above | ||||
| 965 | 1 | 1µs | $chars{0xca41}=0x00c5; # capital a with ring above | ||
| 966 | 1 | 2µs | $chars{0xcaad}=0x016e; # capital u with ring above | ||
| 967 | 1 | 1µs | $chars{0xca61}=0x00e5; # small a with ring above | ||
| 968 | 1 | 1µs | $chars{0xca75}=0x016f; # small u with ring above | ||
| 969 | 1 | 6µs | $chars{0xca77}=0x1e98; # small w with ring above | ||
| 970 | 1 | 2µs | $chars{0xca79}=0x1e99; # small y with ring above | ||
| 971 | # 4/11 high comma off centre | ||||
| 972 | # 4/12 inverted high comma centred | ||||
| 973 | # 4/13 double acute accent | ||||
| 974 | 1 | 2µs | $chars{0xcd4f}=0x0150; # capital o with double acute | ||
| 975 | 1 | 1µs | $chars{0xcd55}=0x0170; # capital u with double acute | ||
| 976 | 1 | 2µs | $chars{0xcd6f}=0x0151; # small o with double acute | ||
| 977 | 1 | 2µs | $chars{0xcd75}=0x0171; # small u with double acute | ||
| 978 | # 4/14 horn | ||||
| 979 | 1 | 2µs | $chars{0xce54}=0x01a0; # latin capital letter o with horn | ||
| 980 | 1 | 2µs | $chars{0xce55}=0x01af; # latin capital letter u with horn | ||
| 981 | 1 | 1µs | $chars{0xce74}=0x01a1; # latin small letter o with horn | ||
| 982 | 1 | 900ns | $chars{0xce75}=0x01b0; # latin small letter u with horn | ||
| 983 | # 4/15 caron (hacek | ||||
| 984 | 1 | 900ns | $chars{0xcf41}=0x01cd; # capital a with caron | ||
| 985 | 1 | 900ns | $chars{0xcf43}=0x010c; # capital c with caron | ||
| 986 | 1 | 1µs | $chars{0xcf44}=0x010e; # capital d with caron | ||
| 987 | 1 | 1µs | $chars{0xcf45}=0x011a; # capital e with caron | ||
| 988 | 1 | 2µs | $chars{0xcf47}=0x01e6; # capital g with caron | ||
| 989 | 1 | 900ns | $chars{0xcf49}=0x01cf; # capital i with caron | ||
| 990 | 1 | 2µs | $chars{0xcf4b}=0x01e8; # capital k with caron | ||
| 991 | 1 | 1µs | $chars{0xcf4c}=0x013d; # capital l with caron | ||
| 992 | 1 | 2µs | $chars{0xcf4e}=0x0147; # capital n with caron | ||
| 993 | 1 | 1µs | $chars{0xcf4f}=0x01d1; # capital o with caron | ||
| 994 | 1 | 2µs | $chars{0xcf52}=0x0158; # capital r with caron | ||
| 995 | 1 | 1µs | $chars{0xcf53}=0x0160; # capital s with caron | ||
| 996 | 1 | 2µs | $chars{0xcf54}=0x0164; # capital t with caron | ||
| 997 | 1 | 1µs | $chars{0xcf55}=0x01d3; # capital u with caron | ||
| 998 | 1 | 1µs | $chars{0xcf5a}=0x017d; # capital z with caron | ||
| 999 | 1 | 1µs | $chars{0xcf61}=0x01ce; # small a with caron | ||
| 1000 | 1 | 1µs | $chars{0xcf63}=0x010d; # small c with caron | ||
| 1001 | 1 | 1µs | $chars{0xcf64}=0x010f; # small d with caron | ||
| 1002 | 1 | 1µs | $chars{0xcf65}=0x011b; # small e with caron | ||
| 1003 | 1 | 2µs | $chars{0xcf67}=0x01e7; # small g with caron | ||
| 1004 | 1 | 1µs | $chars{0xcf69}=0x01d0; # small i with caron | ||
| 1005 | 1 | 2µs | $chars{0xcf6a}=0x01f0; # small j with caron | ||
| 1006 | 1 | 2µs | $chars{0xcf6b}=0x01e9; # small k with caron | ||
| 1007 | 1 | 1µs | $chars{0xcf6c}=0x013e; # small l with caron | ||
| 1008 | 1 | 2µs | $chars{0xcf6e}=0x0148; # small n with caron | ||
| 1009 | 1 | 2µs | $chars{0xcf6f}=0x01d2; # small o with caron | ||
| 1010 | 1 | 1µs | $chars{0xcf72}=0x0159; # small r with caron | ||
| 1011 | 1 | 2µs | $chars{0xcf73}=0x0161; # small s with caron | ||
| 1012 | 1 | 1µs | $chars{0xcf74}=0x0165; # small t with caron | ||
| 1013 | 1 | 2µs | $chars{0xcf75}=0x01d4; # small u with caron | ||
| 1014 | 1 | 1µs | $chars{0xcf7a}=0x017e; # small z with caron | ||
| 1015 | # 5/0 cedilla | ||||
| 1016 | 1 | 900ns | $chars{0xd020}=0x00b8; # cedilla | ||
| 1017 | 1 | 1µs | $chars{0xd043}=0x00c7; # capital c with cedilla | ||
| 1018 | 1 | 1µs | $chars{0xd044}=0x1e10; # capital d with cedilla | ||
| 1019 | 1 | 14µs | $chars{0xd047}=0x0122; # capital g with cedilla | ||
| 1020 | 1 | 1µs | $chars{0xd048}=0x1e28; # capital h with cedilla | ||
| 1021 | 1 | 2µs | $chars{0xd04b}=0x0136; # capital k with cedilla | ||
| 1022 | 1 | 2µs | $chars{0xd04c}=0x013b; # capital l with cedilla | ||
| 1023 | 1 | 900ns | $chars{0xd04e}=0x0145; # capital n with cedilla | ||
| 1024 | 1 | 2µs | $chars{0xd052}=0x0156; # capital r with cedilla | ||
| 1025 | 1 | 800ns | $chars{0xd053}=0x015e; # capital s with cedilla | ||
| 1026 | 1 | 4µs | $chars{0xd054}=0x0162; # capital t with cedilla | ||
| 1027 | 1 | 2µs | $chars{0xd063}=0x00e7; # small c with cedilla | ||
| 1028 | 1 | 2µs | $chars{0xd064}=0x1e11; # small d with cedilla | ||
| 1029 | 1 | 1µs | $chars{0xd065}=0x0119; # small e with cedilla | ||
| 1030 | 1 | 1µs | $chars{0xd067}=0x0123; # small g with cedilla | ||
| 1031 | 1 | 900ns | $chars{0xd068}=0x1e29; # small h with cedilla | ||
| 1032 | 1 | 1µs | $chars{0xd06b}=0x0137; # small k with cedilla | ||
| 1033 | 1 | 1µs | $chars{0xd06c}=0x013c; # small l with cedilla | ||
| 1034 | 1 | 1µs | $chars{0xd06e}=0x0146; # small n with cedilla | ||
| 1035 | 1 | 1µs | $chars{0xd072}=0x0157; # small r with cedilla | ||
| 1036 | 1 | 6µs | $chars{0xd073}=0x015f; # small s with cedilla | ||
| 1037 | 1 | 2µs | $chars{0xd074}=0x0163; # small t with cedilla | ||
| 1038 | # 5/1 rude | ||||
| 1039 | # 5/2 hook to left | ||||
| 1040 | # 5/3 ogonek (hook to right | ||||
| 1041 | 1 | 4µs | $chars{0xd320}=0x02db; # ogonek | ||
| 1042 | 1 | 2µs | $chars{0xd341}=0x0104; # capital a with ogonek | ||
| 1043 | 1 | 900ns | $chars{0xd345}=0x0118; # capital e with ogonek | ||
| 1044 | 1 | 2µs | $chars{0xd349}=0x012e; # capital i with ogonek | ||
| 1045 | 1 | 1µs | $chars{0xd34f}=0x01ea; # capital o with ogonek | ||
| 1046 | 1 | 1µs | $chars{0xd355}=0x0172; # capital u with ogonek | ||
| 1047 | 1 | 2µs | $chars{0xd361}=0x0105; # small a with ogonek | ||
| 1048 | 1 | 1µs | $chars{0xd365}=0x0119; # small e with ogonek | ||
| 1049 | 1 | 2µs | $chars{0xd369}=0x012f; # small i with ogonek | ||
| 1050 | 1 | 1µs | $chars{0xd36f}=0x01eb; # small o with ogonek | ||
| 1051 | 1 | 1µs | $chars{0xd375}=0x0173; # small u with ogonek | ||
| 1052 | # 5/4 circle below | ||||
| 1053 | 1 | 3µs | $chars{0xd441}=0x1e00; # capital a with ring below | ||
| 1054 | 1 | 2µs | $chars{0xd461}=0x1e01; # small a with ring below | ||
| 1055 | # 5/5 half circle below | ||||
| 1056 | 1 | 1µs | $chars{0xf948}=0x1e2a; # capital h with breve below | ||
| 1057 | 1 | 1µs | $chars{0xf968}=0x1e2b; # small h with breve below | ||
| 1058 | # 5/6 dot below | ||||
| 1059 | 1 | 2µs | $chars{0xd641}=0x1ea0; # capital a with dot below | ||
| 1060 | 1 | 1µs | $chars{0xd642}=0x1e04; # capital b with dot below | ||
| 1061 | 1 | 1µs | $chars{0xd644}=0x1e0c; # capital d with dot below | ||
| 1062 | 1 | 1µs | $chars{0xd645}=0x1eb8; # capital e with dot below | ||
| 1063 | 1 | 1µs | $chars{0xd648}=0x1e24; # capital h with dot below | ||
| 1064 | 1 | 1µs | $chars{0xd649}=0x1eca; # capital i with dot below | ||
| 1065 | 1 | 1µs | $chars{0xd64b}=0x1e32; # capital k with dot below | ||
| 1066 | 1 | 2µs | $chars{0xd64c}=0x1e36; # capital l with dot below | ||
| 1067 | 1 | 2µs | $chars{0xd64d}=0x1e42; # capital m with dot below | ||
| 1068 | 1 | 2µs | $chars{0xd64e}=0x1e46; # capital n with dot below | ||
| 1069 | 1 | 1µs | $chars{0xd64f}=0x1ecc; # capital o with dot below | ||
| 1070 | 1 | 1µs | $chars{0xd652}=0x1e5a; # capital r with dot below | ||
| 1071 | 1 | 2µs | $chars{0xd653}=0x1e62; # capital s with dot below | ||
| 1072 | 1 | 3µs | $chars{0xd654}=0x1e6c; # capital t with dot below | ||
| 1073 | 1 | 2µs | $chars{0xd655}=0x1ee4; # capital u with dot below | ||
| 1074 | 1 | 2µs | $chars{0xd656}=0x1e7e; # capital v with dot below | ||
| 1075 | 1 | 2µs | $chars{0xd657}=0x1e88; # capital w with dot below | ||
| 1076 | 1 | 1µs | $chars{0xd659}=0x1ef4; # capital y with dot below | ||
| 1077 | 1 | 1µs | $chars{0xd65a}=0x1e92; # capital z with dot below | ||
| 1078 | 1 | 2µs | $chars{0xd661}=0x1ea1; # small a with dot below | ||
| 1079 | 1 | 2µs | $chars{0xd662}=0x1e05; # small b with dot below | ||
| 1080 | 1 | 2µs | $chars{0xd664}=0x1e0d; # small d with dot below | ||
| 1081 | 1 | 2µs | $chars{0xd665}=0x1eb9; # small e with dot below | ||
| 1082 | 1 | 1µs | $chars{0xd668}=0x1e25; # small h with dot below | ||
| 1083 | 1 | 2µs | $chars{0xd669}=0x1ecb; # small i with dot below | ||
| 1084 | 1 | 2µs | $chars{0xd66b}=0x1e33; # small k with dot below | ||
| 1085 | 1 | 2µs | $chars{0xd66c}=0x1e37; # small l with dot below | ||
| 1086 | 1 | 7µs | $chars{0xd66d}=0x1e43; # small m with dot below | ||
| 1087 | 1 | 2µs | $chars{0xd66e}=0x1e47; # small n with dot below | ||
| 1088 | 1 | 2µs | $chars{0xd66f}=0x1ecd; # small o with dot below | ||
| 1089 | 1 | 2µs | $chars{0xd672}=0x1e5b; # small r with dot below | ||
| 1090 | 1 | 1µs | $chars{0xd673}=0x1e63; # small s with dot below | ||
| 1091 | 1 | 2µs | $chars{0xd674}=0x1e6d; # small t with dot below | ||
| 1092 | 1 | 2µs | $chars{0xd675}=0x1ee5; # small u with dot below | ||
| 1093 | 1 | 2µs | $chars{0xd676}=0x1e7f; # small v with dot below | ||
| 1094 | 1 | 2µs | $chars{0xd677}=0x1e89; # small w with dot below | ||
| 1095 | 1 | 2µs | $chars{0xd679}=0x1ef5; # small y with dot below | ||
| 1096 | 1 | 1µs | $chars{0xd67a}=0x1e93; # small z with dot below | ||
| 1097 | # 5/7 double dot below | ||||
| 1098 | 1 | 2µs | $chars{0xd755}=0x1e72; # capital u with diaeresis below | ||
| 1099 | 1 | 2µs | $chars{0xd775}=0x1e73; # small u with diaeresis below | ||
| 1100 | # 5/8 underline | ||||
| 1101 | 1 | 2µs | $chars{0xd820}=0x005f; # underline | ||
| 1102 | # 5/9 double underline | ||||
| 1103 | 1 | 2µs | $chars{0xd920}=0x2017; # double underline | ||
| 1104 | # 5/10 small low vertical bar | ||||
| 1105 | 1 | 1µs | $chars{0xda20}=0x02cc; # | ||
| 1106 | # 5/11 circumflex below | ||||
| 1107 | # 5/12 (this position shall not be used) | ||||
| 1108 | # 5/13 left half of ligature sign and of double tilde | ||||
| 1109 | # 5/14 right half of ligature sign | ||||
| 1110 | # 5/15 right half of double tilde | ||||
| 1111 | # map {printf "%x :%x\n",$_,$chars{$_};}keys %chars; | ||||
| 1112 | |||||
| 1113 | sub char_decode5426 { | ||||
| 1114 | my ( $string) = @_; | ||||
| 1115 | my $result; | ||||
| 1116 | |||||
| 1117 | my @data = unpack("C*", $string); | ||||
| 1118 | my @characters; | ||||
| 1119 | my $length=scalar(@data); | ||||
| 1120 | for (my $i = 0; $i < scalar(@data); $i++) { | ||||
| 1121 | my $char= $data[$i]; | ||||
| 1122 | if ($char >= 0x00 && $char <= 0x7F){ | ||||
| 1123 | #IsAscii | ||||
| 1124 | |||||
| 1125 | push @characters,$char unless ($char<0x02 ||$char== 0x0F); | ||||
| 1126 | }elsif (($char >= 0xC0 && $char <= 0xDF)) { | ||||
| 1127 | #Combined Char | ||||
| 1128 | my $convchar ; | ||||
| 1129 | if ($chars{$char*256+$data[$i+1]}) { | ||||
| 1130 | $convchar= $chars{$char * 256 + $data[$i+1]}; | ||||
| 1131 | $i++; | ||||
| 1132 | # printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar; | ||||
| 1133 | } elsif ($chars{$char}) { | ||||
| 1134 | $convchar= $chars{$char}; | ||||
| 1135 | # printf "0xC char %x, converted %x\n",$char,$chars{$char}; | ||||
| 1136 | }else { | ||||
| 1137 | $convchar=$char; | ||||
| 1138 | } | ||||
| 1139 | push @characters,$convchar; | ||||
| 1140 | } else { | ||||
| 1141 | my $convchar; | ||||
| 1142 | if ($chars{$char}) { | ||||
| 1143 | $convchar= $chars{$char}; | ||||
| 1144 | # printf "char %x, converted %x\n",$char,$chars{$char}; | ||||
| 1145 | }else { | ||||
| 1146 | # printf "char %x $char\n",$char; | ||||
| 1147 | $convchar=$char; | ||||
| 1148 | } | ||||
| 1149 | push @characters,$convchar; | ||||
| 1150 | } | ||||
| 1151 | } | ||||
| 1152 | $result=pack "U*",@characters; | ||||
| 1153 | # $result=~s/\x01//; | ||||
| 1154 | # $result=~s/\x00//; | ||||
| 1155 | $result=~s/\x0f//; | ||||
| 1156 | $result=~s/\x1b.//; | ||||
| 1157 | $result=~s/\x0e//; | ||||
| 1158 | $result=~s/\x1b\x5b//; | ||||
| 1159 | # map{printf "%x",$_} @characters; | ||||
| 1160 | # printf "\n"; | ||||
| 1161 | return $result; | ||||
| 1162 | } | ||||
| 1163 | |||||
| 1164 | 1 | 196µs | 1; | ||
| 1165 | |||||
| 1166 | |||||
| 1167 | =head1 AUTHOR | ||||
| 1168 | |||||
| - - | |||||
# spent 9.47ms (9.22+246µs) within C4::Charset::CORE:subst which was called 25 times, avg 379µs/call:
# 25 times (9.22ms+246µs) by C4::Charset::StripNonXmlChars at line 385, avg 379µs/call |