| Filename | /usr/share/perl5/MARC/Charset.pm |
| Statements | Executed 34 statements in 2.29ms |
| Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
|---|---|---|---|---|---|
| 1 | 1 | 1 | 1.73ms | 3.79ms | MARC::Charset::BEGIN@12 |
| 1 | 1 | 1 | 1.02ms | 2.11ms | MARC::Charset::BEGIN@10 |
| 1 | 1 | 1 | 726µs | 9.88ms | MARC::Charset::BEGIN@13 |
| 1 | 1 | 1 | 23µs | 36µs | MARC::Charset::BEGIN@5 |
| 1 | 1 | 1 | 18µs | 224µs | MARC::Charset::BEGIN@11 |
| 1 | 1 | 1 | 17µs | 22µs | MARC::Charset::BEGIN@4 |
| 1 | 1 | 1 | 11µs | 277µs | MARC::Charset::BEGIN@14 |
| 1 | 1 | 1 | 10µs | 64µs | MARC::Charset::BEGIN@7 |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::_process_escape |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::assume_encoding |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::assume_unicode |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::ignore_errors |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::marc8_to_utf8 |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::reset_charsets |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::utf8_to_marc8 |
| Line | State ments |
Time on line |
Calls | Time in subs |
Code |
|---|---|---|---|---|---|
| 1 | package MARC::Charset; | ||||
| 2 | |||||
| 3 | 1 | 600ns | our $VERSION = '1.2'; | ||
| 4 | 3 | 28µs | 2 | 26µs | # spent 22µs (17+4) within MARC::Charset::BEGIN@4 which was called:
# once (17µs+4µs) by MARC::File::SAX::BEGIN@14 at line 4 # spent 22µs making 1 call to MARC::Charset::BEGIN@4
# spent 4µs making 1 call to strict::import |
| 5 | 3 | 31µs | 2 | 48µs | # spent 36µs (23+12) within MARC::Charset::BEGIN@5 which was called:
# once (23µs+12µs) by MARC::File::SAX::BEGIN@14 at line 5 # spent 36µs making 1 call to MARC::Charset::BEGIN@5
# spent 12µs making 1 call to warnings::import |
| 6 | |||||
| 7 | 3 | 47µs | 2 | 118µs | # spent 64µs (10+54) within MARC::Charset::BEGIN@7 which was called:
# once (10µs+54µs) by MARC::File::SAX::BEGIN@14 at line 7 # spent 64µs making 1 call to MARC::Charset::BEGIN@7
# spent 54µs making 1 call to base::import |
| 8 | 1 | 1µs | our @EXPORT_OK = qw(marc8_to_utf8 utf8_to_marc8); | ||
| 9 | |||||
| 10 | 3 | 356µs | 2 | 2.17ms | # spent 2.11ms (1.02+1.10) within MARC::Charset::BEGIN@10 which was called:
# once (1.02ms+1.10ms) by MARC::File::SAX::BEGIN@14 at line 10 # spent 2.11ms making 1 call to MARC::Charset::BEGIN@10
# spent 58µs making 1 call to Exporter::import |
| 11 | 3 | 44µs | 2 | 430µs | # spent 224µs (18+206) within MARC::Charset::BEGIN@11 which was called:
# once (18µs+206µs) by MARC::File::SAX::BEGIN@14 at line 11 # spent 224µs making 1 call to MARC::Charset::BEGIN@11
# spent 206µs making 1 call to Exporter::import |
| 12 | 3 | 143µs | 2 | 5.73ms | # spent 3.79ms (1.73+2.06) within MARC::Charset::BEGIN@12 which was called:
# once (1.73ms+2.06ms) by MARC::File::SAX::BEGIN@14 at line 12 # spent 3.79ms making 1 call to MARC::Charset::BEGIN@12
# spent 1.94ms making 1 call to charnames::import |
| 13 | 3 | 155µs | 1 | 9.88ms | # spent 9.88ms (726µs+9.15) within MARC::Charset::BEGIN@13 which was called:
# once (726µs+9.15ms) by MARC::File::SAX::BEGIN@14 at line 13 # spent 9.88ms making 1 call to MARC::Charset::BEGIN@13 |
| 14 | 3 | 1.47ms | 2 | 543µs | # spent 277µs (11+266) within MARC::Charset::BEGIN@14 which was called:
# once (11µs+266µs) by MARC::File::SAX::BEGIN@14 at line 14 # spent 277µs making 1 call to MARC::Charset::BEGIN@14
# spent 266µs making 1 call to Exporter::import |
| 15 | |||||
| 16 | =head1 NAME | ||||
| 17 | |||||
| - - | |||||
| 43 | # get the mapping table | ||||
| 44 | 1 | 3µs | 1 | 184µs | our $table = MARC::Charset::Table->new(); # spent 184µs making 1 call to MARC::Charset::Table::new |
| 45 | |||||
| 46 | # set default character sets | ||||
| 47 | # these are viewable at the package level | ||||
| 48 | # in case someone wants to set them | ||||
| 49 | 1 | 600ns | our $DEFAULT_G0 = ASCII_DEFAULT; | ||
| 50 | 1 | 400ns | our $DEFAULT_G1 = EXTENDED_LATIN; | ||
| 51 | |||||
| 52 | =head2 ignore_errors() | ||||
| 53 | |||||
| - - | |||||
| 66 | 1 | 400ns | our $_ignore_errors = 0; | ||
| 67 | sub ignore_errors { | ||||
| 68 | my ($self,$i) = @_; | ||||
| 69 | $_ignore_errors = $i if (defined($i)); | ||||
| 70 | return $_ignore_errors; | ||||
| 71 | } | ||||
| 72 | |||||
| 73 | |||||
| 74 | =head2 assume_unicode() | ||||
| 75 | |||||
| - - | |||||
| 89 | 1 | 300ns | our $_assume = ''; | ||
| 90 | sub assume_unicode { | ||||
| 91 | my ($self,$i) = @_; | ||||
| 92 | $_assume = 'utf8' if (defined($i) and $i); | ||||
| 93 | return 1 if ($_assume eq 'utf8'); | ||||
| 94 | } | ||||
| 95 | |||||
| 96 | |||||
| 97 | =head2 assume_encoding() | ||||
| 98 | |||||
| - - | |||||
| 111 | sub assume_encoding { | ||||
| 112 | my ($self,$i) = @_; | ||||
| 113 | $_assume = $i if (defined($i)); | ||||
| 114 | return $_assume; | ||||
| 115 | } | ||||
| 116 | |||||
| 117 | |||||
| 118 | # place holders for working graphical character sets | ||||
| 119 | 1 | 300ns | my $G0; | ||
| 120 | 1 | 200ns | my $G1; | ||
| 121 | |||||
| 122 | =head2 marc8_to_utf8() | ||||
| 123 | |||||
| - - | |||||
| 142 | sub marc8_to_utf8 | ||||
| 143 | { | ||||
| 144 | my ($marc8, $ignore_errors) = @_; | ||||
| 145 | reset_charsets(); | ||||
| 146 | |||||
| 147 | $ignore_errors = $_ignore_errors if (!defined($ignore_errors)); | ||||
| 148 | |||||
| 149 | # holder for our utf8 | ||||
| 150 | my $utf8 = ''; | ||||
| 151 | |||||
| 152 | my $index = 0; | ||||
| 153 | my $length = length($marc8); | ||||
| 154 | my $combining = ''; | ||||
| 155 | CHAR_LOOP: while ($index < $length) | ||||
| 156 | { | ||||
| 157 | # whitespace, line feeds and carriage returns just get added on unmolested | ||||
| 158 | if (substr($marc8, $index, 1) =~ m/(\s+|\x0A+|\x0D+)/so) | ||||
| 159 | { | ||||
| 160 | $utf8 .= $1; | ||||
| 161 | $index += 1; | ||||
| 162 | next CHAR_LOOP; | ||||
| 163 | } | ||||
| 164 | |||||
| 165 | # look for any escape sequences | ||||
| 166 | my $new_index = _process_escape(\$marc8, $index, $length); | ||||
| 167 | if ($new_index > $index) | ||||
| 168 | { | ||||
| 169 | $index = $new_index; | ||||
| 170 | next CHAR_LOOP; | ||||
| 171 | } | ||||
| 172 | |||||
| 173 | my $found; | ||||
| 174 | CHARSET_LOOP: foreach my $charset ($G0, $G1) | ||||
| 175 | { | ||||
| 176 | |||||
| 177 | # cjk characters are a string of three chars | ||||
| 178 | my $char_size = $charset eq CJK ? 3 : 1; | ||||
| 179 | |||||
| 180 | # extract the next code point to examine | ||||
| 181 | my $chunk = substr($marc8, $index, $char_size); | ||||
| 182 | |||||
| 183 | # look up the character to see if it's in our mapping | ||||
| 184 | my $code = $table->lookup_by_marc8($charset, $chunk); | ||||
| 185 | |||||
| 186 | # try the next character set if no mapping was found | ||||
| 187 | next CHARSET_LOOP if ! $code; | ||||
| 188 | $found = 1; | ||||
| 189 | |||||
| 190 | # gobble up all combining characters for appending later | ||||
| 191 | # this is necessary because combinging characters precede | ||||
| 192 | # the character they modifiy in MARC-8, whereas they follow | ||||
| 193 | # the character they modify in UTF-8. | ||||
| 194 | if ($code->is_combining()) | ||||
| 195 | { | ||||
| 196 | $combining .= $code->char_value(); | ||||
| 197 | } | ||||
| 198 | else | ||||
| 199 | { | ||||
| 200 | $utf8 .= $code->char_value() . $combining; | ||||
| 201 | $combining = ''; | ||||
| 202 | } | ||||
| 203 | |||||
| 204 | $index += $char_size; | ||||
| 205 | next CHAR_LOOP; | ||||
| 206 | } | ||||
| 207 | |||||
| 208 | if (!$found) | ||||
| 209 | { | ||||
| 210 | warn(sprintf("no mapping found for [0x\%X] at position $index in $marc8 ". | ||||
| 211 | "g0=".MARC::Charset::Constants::charset_name($G0) . " " . | ||||
| 212 | "g1=".MARC::Charset::Constants::charset_name($G1), unpack('C',substr($marc8,$index,1)))); | ||||
| 213 | if (!$ignore_errors) | ||||
| 214 | { | ||||
| 215 | reset_charsets(); | ||||
| 216 | return; | ||||
| 217 | } | ||||
| 218 | if ($_assume) | ||||
| 219 | { | ||||
| 220 | reset_charsets(); | ||||
| 221 | return NFC(decode($_assume => $marc8)); | ||||
| 222 | } | ||||
| 223 | $index += 1; | ||||
| 224 | } | ||||
| 225 | |||||
| 226 | } | ||||
| 227 | |||||
| 228 | # return the utf8 | ||||
| 229 | reset_charsets(); | ||||
| 230 | return $utf8; | ||||
| 231 | } | ||||
| 232 | |||||
| - - | |||||
| 235 | =head2 utf8_to_marc8() | ||||
| 236 | |||||
| - - | |||||
| 254 | sub utf8_to_marc8 | ||||
| 255 | { | ||||
| 256 | my ($utf8, $ignore_errors) = @_; | ||||
| 257 | reset_charsets(); | ||||
| 258 | |||||
| 259 | $ignore_errors = $_ignore_errors if (!defined($ignore_errors)); | ||||
| 260 | |||||
| 261 | # decompose combined characters | ||||
| 262 | $utf8 = NFD($utf8); | ||||
| 263 | |||||
| 264 | my $len = length($utf8); | ||||
| 265 | my $marc8 = ''; | ||||
| 266 | for (my $i=0; $i<$len; $i++) | ||||
| 267 | { | ||||
| 268 | my $slice = substr($utf8, $i, 1); | ||||
| 269 | |||||
| 270 | # spaces are copied from utf8 into marc8 | ||||
| 271 | if ($slice eq ' ') | ||||
| 272 | { | ||||
| 273 | $marc8 .= ' '; | ||||
| 274 | next; | ||||
| 275 | } | ||||
| 276 | |||||
| 277 | # try to find the code point in our mapping table | ||||
| 278 | my $code = $table->lookup_by_utf8($slice); | ||||
| 279 | |||||
| 280 | if (! $code) | ||||
| 281 | { | ||||
| 282 | warn("no mapping found at position $i in $utf8"); | ||||
| 283 | reset_charsets() and return unless $ignore_errors; | ||||
| 284 | } | ||||
| 285 | |||||
| 286 | # if it's a combining character move it around | ||||
| 287 | if ($code->is_combining()) | ||||
| 288 | { | ||||
| 289 | my $prev = chop($marc8); | ||||
| 290 | $marc8 .= $code->marc_value() . $prev; | ||||
| 291 | next; | ||||
| 292 | } | ||||
| 293 | |||||
| 294 | # look to see if we need to escape to a new G0 charset | ||||
| 295 | my $charset_value = $code->charset_value(); | ||||
| 296 | |||||
| 297 | if ($code->default_charset_group() eq 'G0' | ||||
| 298 | and $G0 ne $charset_value) | ||||
| 299 | { | ||||
| 300 | if ($G0 eq ASCII_DEFAULT and $charset_value eq BASIC_LATIN) | ||||
| 301 | { | ||||
| 302 | # don't bother escaping, they're functionally the same | ||||
| 303 | } | ||||
| 304 | else | ||||
| 305 | { | ||||
| 306 | $marc8 .= $code->get_escape(); | ||||
| 307 | $G0 = $charset_value; | ||||
| 308 | } | ||||
| 309 | } | ||||
| 310 | |||||
| 311 | # look to see if we need to escape to a new G1 charset | ||||
| 312 | elsif ($code->default_charset_group() eq 'G1' | ||||
| 313 | and $G1 ne $charset_value) | ||||
| 314 | { | ||||
| 315 | $marc8 .= $code->get_escape(); | ||||
| 316 | $G1 = $charset_value; | ||||
| 317 | } | ||||
| 318 | |||||
| 319 | $marc8 .= $code->marc_value(); | ||||
| 320 | } | ||||
| 321 | |||||
| 322 | # escape back to default G0 if necessary | ||||
| 323 | if ($G0 ne $DEFAULT_G0) | ||||
| 324 | { | ||||
| 325 | if ($DEFAULT_G0 eq ASCII_DEFAULT) { $marc8 .= ESCAPE . ASCII_DEFAULT; } | ||||
| 326 | elsif ($DEFAULT_G0 eq CJK) { $marc8 .= ESCAPE . MULTI_G0_A . CJK; } | ||||
| 327 | else { $marc8 .= ESCAPE . SINGLE_G0_A . $DEFAULT_G0; } | ||||
| 328 | } | ||||
| 329 | |||||
| 330 | # escape back to default G1 if necessary | ||||
| 331 | if ($G1 ne $DEFAULT_G1) | ||||
| 332 | { | ||||
| 333 | if ($DEFAULT_G1 eq CJK) { $marc8 .= ESCAPE . MULTI_G1_A . $DEFAULT_G1; } | ||||
| 334 | else { $marc8 .= ESCAPE . SINGLE_G1_A . $DEFAULT_G1; } | ||||
| 335 | } | ||||
| 336 | |||||
| 337 | return $marc8; | ||||
| 338 | } | ||||
| 339 | |||||
| - - | |||||
| 342 | =head1 DEFAULT CHARACTER SETS | ||||
| 343 | |||||
| - - | |||||
| 377 | sub _process_escape | ||||
| 378 | { | ||||
| 379 | ## this stuff is kind of scary ... for an explanation of what is | ||||
| 380 | ## going on here check out the MARC-8 specs at LC. | ||||
| 381 | ## http://lcweb.loc.gov/marc/specifications/speccharmarc8.html | ||||
| 382 | my ($str_ref, $left, $right) = @_; | ||||
| 383 | |||||
| 384 | # first char needs to be an escape or else this isn't an escape sequence | ||||
| 385 | return $left unless substr($$str_ref, $left, 1) eq ESCAPE; | ||||
| 386 | |||||
| 387 | ## if we don't have at least one character after the escape | ||||
| 388 | ## then this can't be a character escape sequence | ||||
| 389 | return $left if ($left+1 >= $right); | ||||
| 390 | |||||
| 391 | ## pull off the first escape | ||||
| 392 | my $esc_char_1 = substr($$str_ref, $left+1, 1); | ||||
| 393 | |||||
| 394 | ## the first method of escaping to small character sets | ||||
| 395 | if ( $esc_char_1 eq GREEK_SYMBOLS | ||||
| 396 | or $esc_char_1 eq SUBSCRIPTS | ||||
| 397 | or $esc_char_1 eq SUPERSCRIPTS | ||||
| 398 | or $esc_char_1 eq ASCII_DEFAULT) | ||||
| 399 | { | ||||
| 400 | $G0 = $esc_char_1; | ||||
| 401 | return $left+2; | ||||
| 402 | } | ||||
| 403 | |||||
| 404 | ## the second more complicated method of escaping to bigger charsets | ||||
| 405 | return $left if $left+2 >= $right; | ||||
| 406 | |||||
| 407 | my $esc_char_2 = substr($$str_ref, $left+2, 1); | ||||
| 408 | my $esc_chars = $esc_char_1 . $esc_char_2; | ||||
| 409 | |||||
| 410 | if ($esc_char_1 eq SINGLE_G0_A | ||||
| 411 | or $esc_char_1 eq SINGLE_G0_B) | ||||
| 412 | { | ||||
| 413 | $G0 = $esc_char_2; | ||||
| 414 | return $left+3; | ||||
| 415 | } | ||||
| 416 | |||||
| 417 | elsif ($esc_char_1 eq SINGLE_G1_A | ||||
| 418 | or $esc_char_1 eq SINGLE_G1_B) | ||||
| 419 | { | ||||
| 420 | $G1 = $esc_char_2; | ||||
| 421 | return $left+3; | ||||
| 422 | } | ||||
| 423 | |||||
| 424 | elsif ( $esc_char_1 eq MULTI_G0_A ) { | ||||
| 425 | $G0 = $esc_char_2; | ||||
| 426 | return $left+3; | ||||
| 427 | } | ||||
| 428 | |||||
| 429 | elsif ($esc_chars eq MULTI_G0_B | ||||
| 430 | and ($left+3 < $right)) | ||||
| 431 | { | ||||
| 432 | $G0 = substr($$str_ref, $left+3, 1); | ||||
| 433 | return $left+4; | ||||
| 434 | } | ||||
| 435 | |||||
| 436 | elsif (($esc_chars eq MULTI_G1_A or $esc_chars eq MULTI_G1_B) | ||||
| 437 | and ($left + 3 < $right)) | ||||
| 438 | { | ||||
| 439 | $G1 = substr($$str_ref, $left+3, 1); | ||||
| 440 | return $left+4; | ||||
| 441 | } | ||||
| 442 | |||||
| 443 | # we should never get here | ||||
| 444 | warn("seem to have fallen through in _process_escape()"); | ||||
| 445 | return $left; | ||||
| 446 | } | ||||
| 447 | |||||
| 448 | sub reset_charsets | ||||
| 449 | { | ||||
| 450 | $G0 = $DEFAULT_G0; | ||||
| 451 | $G1 = $DEFAULT_G1; | ||||
| 452 | } | ||||
| 453 | |||||
| 454 | 1 | 8µs | 1; |