← Index
NYTProf Performance Profile   « line view »
For svc/members/upsert
  Run on Tue Jan 13 11:50:22 2015
Reported on Tue Jan 13 12:09:49 2015

Filename/usr/share/perl5/MARC/Charset.pm
StatementsExecuted 26 statements in 4.42ms
Subroutines
Calls P F Exclusive
Time
Inclusive
Time
Subroutine
1111.59ms13.1msMARC::Charset::::BEGIN@14MARC::Charset::BEGIN@14
1111.40ms2.31msMARC::Charset::::BEGIN@11MARC::Charset::BEGIN@11
1111.04ms25.9msMARC::Charset::::BEGIN@13MARC::Charset::BEGIN@13
11111µs22µsMARC::Charset::::BEGIN@3MARC::Charset::BEGIN@3
1118µs28µsMARC::Charset::::BEGIN@12MARC::Charset::BEGIN@12
1118µs170µsMARC::Charset::::BEGIN@15MARC::Charset::BEGIN@15
1117µs12µsMARC::Charset::::BEGIN@4MARC::Charset::BEGIN@4
1117µs64µsMARC::Charset::::BEGIN@8MARC::Charset::BEGIN@8
0000s0sMARC::Charset::::_process_escapeMARC::Charset::_process_escape
0000s0sMARC::Charset::::assume_encodingMARC::Charset::assume_encoding
0000s0sMARC::Charset::::assume_unicodeMARC::Charset::assume_unicode
0000s0sMARC::Charset::::ignore_errorsMARC::Charset::ignore_errors
0000s0sMARC::Charset::::marc8_to_utf8MARC::Charset::marc8_to_utf8
0000s0sMARC::Charset::::reset_charsetsMARC::Charset::reset_charsets
0000s0sMARC::Charset::::utf8_to_marc8MARC::Charset::utf8_to_marc8
Call graph for these subroutines as a Graphviz dot language file.
Line State
ments
Time
on line
Calls Time
in subs
Code
1package MARC::Charset;
2
3222µs234µs
# spent 22µs (11+12) within MARC::Charset::BEGIN@3 which was called: # once (11µs+12µs) by MARC::File::XML::BEGIN@11 at line 3
use strict;
# spent 22µs making 1 call to MARC::Charset::BEGIN@3 # spent 12µs making 1 call to strict::import
4231µs216µs
# spent 12µs (7+5) within MARC::Charset::BEGIN@4 which was called: # once (7µs+5µs) by MARC::File::XML::BEGIN@11 at line 4
use warnings;
# spent 12µs making 1 call to MARC::Charset::BEGIN@4 # spent 5µs making 1 call to warnings::import
5
61700nsour $VERSION = '1.35';
7
8233µs2122µs
# spent 64µs (7+58) within MARC::Charset::BEGIN@8 which was called: # once (7µs+58µs) by MARC::File::XML::BEGIN@11 at line 8
use base qw(Exporter);
# spent 64µs making 1 call to MARC::Charset::BEGIN@8 # spent 58µs making 1 call to base::import
911µsour @EXPORT_OK = qw(marc8_to_utf8 utf8_to_marc8);
10
112755µs22.34ms
# spent 2.31ms (1.40+902µs) within MARC::Charset::BEGIN@11 which was called: # once (1.40ms+902µs) by MARC::File::XML::BEGIN@11 at line 11
use Unicode::Normalize;
# spent 2.31ms making 1 call to MARC::Charset::BEGIN@11 # spent 28µs making 1 call to Exporter::import
12224µs247µs
# spent 28µs (8+20) within MARC::Charset::BEGIN@12 which was called: # once (8µs+20µs) by MARC::File::XML::BEGIN@11 at line 12
use Encode 'decode';
# spent 28µs making 1 call to MARC::Charset::BEGIN@12 # spent 20µs making 1 call to Exporter::import
132665µs226.0ms
# spent 25.9ms (1.04+24.9) within MARC::Charset::BEGIN@13 which was called: # once (1.04ms+24.9ms) by MARC::File::XML::BEGIN@11 at line 13
use charnames ':full';
# spent 25.9ms making 1 call to MARC::Charset::BEGIN@13 # spent 144µs making 1 call to charnames::import
1421.07ms113.1ms
# spent 13.1ms (1.59+11.6) within MARC::Charset::BEGIN@14 which was called: # once (1.59ms+11.6ms) by MARC::File::XML::BEGIN@11 at line 14
use MARC::Charset::Table;
# spent 13.1ms making 1 call to MARC::Charset::BEGIN@14
1521.79ms2332µs
# spent 170µs (8+162) within MARC::Charset::BEGIN@15 which was called: # once (8µs+162µs) by MARC::File::XML::BEGIN@11 at line 15
use MARC::Charset::Constants qw(:all);
# spent 170µs making 1 call to MARC::Charset::BEGIN@15 # spent 162µs making 1 call to Exporter::import
16
17=head1 NAME
18
19MARC::Charset - convert MARC-8 encoded strings to UTF-8
20
21=head1 SYNOPSIS
22
23 # import the marc8_to_utf8 function
24 use MARC::Charset 'marc8_to_utf8';
25
26 # prepare STDOUT for utf8
27 binmode(STDOUT, 'utf8');
28
29 # print out some marc8 as utf8
30 print marc8_to_utf8($marc8_string);
31
32=head1 DESCRIPTION
33
34MARC::Charset allows you to turn MARC-8 encoded strings into UTF-8
35strings. MARC-8 is a single byte character encoding that predates unicode, and
36allows you to put non-Roman scripts in MARC bibliographic records.
37
38 http://www.loc.gov/marc/specifications/spechome.html
39
40=head1 EXPORTS
41
42=cut
43
44# get the mapping table
4512µs169µsour $table = MARC::Charset::Table->new();
# spent 69µs making 1 call to MARC::Charset::Table::new
46
47# set default character sets
48# these are viewable at the package level
49# in case someone wants to set them
501300nsour $DEFAULT_G0 = ASCII_DEFAULT;
511100nsour $DEFAULT_G1 = EXTENDED_LATIN;
52
53123µsour %SPECIAL_DECOMPOSABLE = (
54 chr(0x01a0) => chr(0x01a0), # uppercase o-hook
55 chr(0x01af) => chr(0x01af), # uppercase u-hook
56 chr(0x01a1) => chr(0x01a1), # lowercase o-hook
57 chr(0x01b0) => chr(0x01b0), # lowercase u-hook
58 chr(0x1ef1) => chr(0x01b0) . chr(0x0323), # lowercase u-hook with dot below
59 chr(0x1ee9) => chr(0x01b0) . chr(0x0301), # lowercase u-hook with acute
60 # Arabic to not decompose
61 chr(0x0622) => chr(0x0622),
62 chr(0x0623) => chr(0x0623),
63 chr(0x0624) => chr(0x0624),
64 chr(0x0625) => chr(0x0625),
65 chr(0x0626) => chr(0x0626),
66 chr(0x0649) => chr(0x0649),
67 chr(0x0671) => chr(0x0671),
68 chr(0x06c0) => chr(0x06c0),
69 chr(0x06D3) => chr(0x06D3),
70 # Cyrillic to not decompose
71 chr(0x0439) => chr(0x0439),
72 chr(0x0419) => chr(0x0419),
73 chr(0x0453) => chr(0x0453),
74 chr(0x0451) => chr(0x0451),
75 chr(0x0457) => chr(0x0457),
76 chr(0x045C) => chr(0x045C),
77 chr(0x045E) => chr(0x045E),
78 chr(0x0403) => chr(0x0403),
79 chr(0x0401) => chr(0x0401),
80 chr(0x0407) => chr(0x0407),
81 chr(0x040C) => chr(0x040C),
82 chr(0x040E) => chr(0x040E),
83 # Katakana to not decompose
84 chr(0x309B) => chr(0x309B),
85 chr(0x309C) => chr(0x309C),
86 chr(0x30AC) => chr(0x30AC),
87 chr(0x30AE) => chr(0x30AE),
88 chr(0x30B0) => chr(0x30B0),
89 chr(0x30B2) => chr(0x30B2),
90 chr(0x30B4) => chr(0x30B4),
91 chr(0x30B6) => chr(0x30B6),
92 chr(0x30B8) => chr(0x30B8),
93 chr(0x30BA) => chr(0x30BA),
94 chr(0x30BC) => chr(0x30BC),
95 chr(0x30BE) => chr(0x30BE),
96 chr(0x30C0) => chr(0x30C0),
97 chr(0x30C2) => chr(0x30C2),
98 chr(0x30C5) => chr(0x30C5),
99 chr(0x30C7) => chr(0x30C7),
100 chr(0x30C9) => chr(0x30C9),
101 chr(0x30D0) => chr(0x30D0),
102 chr(0x30D1) => chr(0x30D1),
103 chr(0x30D3) => chr(0x30D3),
104 chr(0x30D4) => chr(0x30D4),
105 chr(0x30D6) => chr(0x30D6),
106 chr(0x30D7) => chr(0x30D7),
107 chr(0x30D9) => chr(0x30D9),
108 chr(0x30DA) => chr(0x30DA),
109 chr(0x30DC) => chr(0x30DC),
110 chr(0x30DD) => chr(0x30DD),
111 chr(0x30F4) => chr(0x30F4),
112 chr(0x30F7) => chr(0x30F7),
113 chr(0x30F8) => chr(0x30F8),
114 chr(0x30F9) => chr(0x30F9),
115 chr(0x30FA) => chr(0x30FA),
116 chr(0x30FE) => chr(0x30FE),
117 chr(0x30FF) => chr(0x30FF),
118);
119
120=head2 ignore_errors()
121
122Tells MARC::Charset whether or not to ignore all encoding errors, and
123returns the current setting. This is helpful if you have records that
124contain both MARC8 and UNICODE characters.
125
126 my $ignore = MARC::Charset->ignore_errors();
127
128 MARC::Charset->ignore_errors(1); # ignore errors
129 MARC::Charset->ignore_errors(0); # DO NOT ignore errors
130
131=cut
132
133
1341200nsour $_ignore_errors = 0;
135sub ignore_errors {
136 my ($self,$i) = @_;
137 $_ignore_errors = $i if (defined($i));
138 return $_ignore_errors;
139}
140
141
142=head2 assume_unicode()
143
144Tells MARC::Charset whether or not to assume UNICODE when an error is
145encountered in ignore_errors mode and returns the current setting.
146This is helpful if you have records that contain both MARC8 and UNICODE
147characters.
148
149 my $setting = MARC::Charset->assume_unicode();
150
151 MARC::Charset->assume_unicode(1); # assume characters are unicode (utf-8)
152 MARC::Charset->assume_unicode(0); # DO NOT assume characters are unicode
153
154=cut
155
156
1571100nsour $_assume = '';
158sub assume_unicode {
159 my ($self,$i) = @_;
160 $_assume = 'utf8' if (defined($i) and $i);
161 return 1 if ($_assume eq 'utf8');
162}
163
164
165=head2 assume_encoding()
166
167Tells MARC::Charset whether or not to assume a specific encoding when an error
168is encountered in ignore_errors mode and returns the current setting. This
169is helpful if you have records that contain both MARC8 and other characters.
170
171 my $setting = MARC::Charset->assume_encoding();
172
173 MARC::Charset->assume_encoding('cp850'); # assume characters are cp850
174 MARC::Charset->assume_encoding(''); # DO NOT assume any encoding
175
176=cut
177
178
179sub assume_encoding {
180 my ($self,$i) = @_;
181 $_assume = $i if (defined($i));
182 return $_assume;
183}
184
185
186# place holders for working graphical character sets
1871100nsmy $G0;
188my $G1;
189
190=head2 marc8_to_utf8()
191
192Converts a MARC-8 encoded string to UTF-8.
193
194 my $utf8 = marc8_to_utf8($marc8);
195
196If you'd like to ignore errors pass in a true value as the 2nd
197parameter or call MARC::Charset->ignore_errors() with a true
198value:
199
200 my $utf8 = marc8_to_utf8($marc8, 'ignore-errors');
201
202 or
203
204 MARC::Charset->ignore_errors(1);
205 my $utf8 = marc8_to_utf8($marc8);
206
207=cut
208
209
210sub marc8_to_utf8
211{
212 my ($marc8, $ignore_errors) = @_;
213 reset_charsets();
214
215 $ignore_errors = $_ignore_errors if (!defined($ignore_errors));
216
217 # holder for our utf8
218 my $utf8 = '';
219
220 my $index = 0;
221 my $length = length($marc8);
222 my $combining = '';
223 CHAR_LOOP: while ($index < $length)
224 {
225 # whitespace, line feeds and carriage returns just get added on unmolested
226 if (substr($marc8, $index, 1) =~ m/(\s+|\x0A+|\x0D+)/so)
227 {
228 $utf8 .= $1;
229 $index += 1;
230 next CHAR_LOOP;
231 }
232
233 # look for any escape sequences
234 my $new_index = _process_escape(\$marc8, $index, $length);
235 if ($new_index > $index)
236 {
237 $index = $new_index;
238 next CHAR_LOOP;
239 }
240
241 my $found;
242 CHARSET_LOOP: foreach my $charset ($G0, $G1)
243 {
244
245 # cjk characters are a string of three chars
246 my $char_size = $charset eq CJK ? 3 : 1;
247
248 # extract the next code point to examine
249 my $chunk = substr($marc8, $index, $char_size);
250
251 my $code;
252 if ($char_size == 1) {
253 my $codepoint = ord($chunk);
254 if ($codepoint >= 0x21 && $codepoint <= 0x7e) {
255 # character is G0
256 $code = $table->lookup_by_marc8($G0, $chunk);
257 } elsif ($codepoint >= 0xa1 && $codepoint <= 0xfe) {
258 # character is G1, map it to G0 before atttempting lookup
259 $code = $table->lookup_by_marc8($G1, chr($codepoint - 128));
260 } elsif ($codepoint >= 0x88 && $codepoint <= 0x8e) {
261 # in the C1 range used by MARC8
262 $code = $table->lookup_by_marc8(EXTENDED_LATIN, $chunk);
263 } elsif ($codepoint >= 0x1b && $codepoint <= 0x1f) {
264 # in the C0 range used by MARC8
265 $code = $table->lookup_by_marc8(BASIC_LATIN, $chunk);
266 }
267 } else {
268 # EACC doesn't need G0/G1 conversion
269 $code = $table->lookup_by_marc8($charset, $chunk);
270 }
271
272 # try the next character set if no mapping was found
273 next CHARSET_LOOP if ! $code;
274 $found = 1;
275
276 # gobble up all combining characters for appending later
277 # this is necessary because combinging characters precede
278 # the character they modify in MARC-8, whereas they follow
279 # the character they modify in UTF-8.
280 if ($code->is_combining())
281 {
282 # If the current character is the right half of a MARC-8
283 # ligature or double tilde, we don't want to include
284 # it in the UTF-8 output. For the explanation, see
285 # http://lcweb2.loc.gov/diglib/codetables/45.html#Note1
286 # Note that if the MARC-8 string includes a right half
287 # without the corresponding left half, the right half will
288 # get dropped instead of being mapped to its UCS alternate.
289 # That's OK since including only one half of a double diacritic
290 # was presumably a mistake to begin with.
291 unless (defined $code->marc_left_half())
292 {
293 $combining .= $code->char_value();
294 }
295 }
296 else
297 {
298 $utf8 .= $code->char_value() . $combining;
299 $combining = '';
300 }
301
302 $index += $char_size;
303 next CHAR_LOOP;
304 }
305
306 if (!$found)
307 {
308 warn(sprintf("no mapping found for [0x\%X] at position $index in $marc8 ".
309 "g0=".MARC::Charset::Constants::charset_name($G0) . " " .
310 "g1=".MARC::Charset::Constants::charset_name($G1), unpack('C',substr($marc8,$index,1))));
311 if (!$ignore_errors)
312 {
313 reset_charsets();
314 return;
315 }
316 if ($_assume)
317 {
318 reset_charsets();
319 return NFC(decode($_assume => $marc8));
320 }
321 $index += 1;
322 }
323
324 }
325
326 # return the utf8
327 reset_charsets();
328 utf8::upgrade($utf8);
329 return $utf8;
330}
331
- -
334=head2 utf8_to_marc8()
335
336Will attempt to translate utf8 into marc8.
337
338 my $marc8 = utf8_to_marc8($utf8);
339
340If you'd like to ignore errors, or characters that can't be
341converted to marc8 then pass in a true value as the second
342parameter:
343
344 my $marc8 = utf8_to_marc8($utf8, 'ignore-errors');
345
346 or
347
348 MARC::Charset->ignore_errors(1);
349 my $utf8 = marc8_to_utf8($marc8);
350
351=cut
352
353sub utf8_to_marc8
354{
355 my ($utf8, $ignore_errors) = @_;
356 reset_charsets();
357
358 $ignore_errors = $_ignore_errors if (!defined($ignore_errors));
359
360 # decompose combined characters
361 $utf8 = join('',
362 map { exists $SPECIAL_DECOMPOSABLE{$_} ? $SPECIAL_DECOMPOSABLE{$_} : NFD($_) }
363 split //, $utf8
364 );
365
366 my $len = length($utf8);
367 my $marc8 = '';
368 for (my $i=0; $i<$len; $i++)
369 {
370 my $slice = substr($utf8, $i, 1);
371
372 # spaces are copied from utf8 into marc8
373 if ($slice eq ' ')
374 {
375 $marc8 .= ' ';
376 next;
377 }
378
379 # try to find the code point in our mapping table
380 my $code = $table->lookup_by_utf8($slice);
381
382 if (! $code)
383 {
384 warn("no mapping found at position $i in $utf8");
385 reset_charsets() and return unless $ignore_errors;
386 }
387
388 # if it's a combining character move it around
389 if ($code->is_combining())
390 {
391 my $prev = chop($marc8);
392 if ($code->marc_left_half())
393 {
394 # don't add the MARC-8 right half character
395 # if it was already inserted when the double
396 # diacritic was converted from UTF-8
397 if ($code->marc_value() eq substr($marc8, -1, 1))
398 {
399 $marc8 .= $prev;
400 next;
401 }
402 }
403 $marc8 .= $code->marc_value() . $prev;
404 if ($code->marc_right_half())
405 {
406 $marc8 .= chr(hex($code->marc_right_half()));
407 }
408 next;
409 }
410
411 # look to see if we need to escape to a new G0 charset
412 my $charset_value = $code->charset_value();
413
414 if ($code->default_charset_group() eq 'G0'
415 and $G0 ne $charset_value)
416 {
417 if ($G0 eq ASCII_DEFAULT and $charset_value eq BASIC_LATIN)
418 {
419 # don't bother escaping, they're functionally the same
420 }
421 else
422 {
423 $marc8 .= $code->get_escape();
424 $G0 = $charset_value;
425 }
426 }
427
428 # look to see if we need to escape to a new G1 charset
429 elsif ($code->default_charset_group() eq 'G1'
430 and $G1 ne $charset_value)
431 {
432 $marc8 .= $code->get_escape();
433 $G1 = $charset_value;
434 }
435
436 $marc8 .= $code->marc_value();
437 }
438
439 # escape back to default G0 if necessary
440 if ($G0 ne $DEFAULT_G0)
441 {
442 if ($DEFAULT_G0 eq ASCII_DEFAULT) { $marc8 .= ESCAPE . ASCII_DEFAULT; }
443 elsif ($DEFAULT_G0 eq CJK) { $marc8 .= ESCAPE . MULTI_G0_A . CJK; }
444 else { $marc8 .= ESCAPE . SINGLE_G0_A . $DEFAULT_G0; }
445 }
446
447 # escape back to default G1 if necessary
448 if ($G1 ne $DEFAULT_G1)
449 {
450 if ($DEFAULT_G1 eq CJK) { $marc8 .= ESCAPE . MULTI_G1_A . $DEFAULT_G1; }
451 else { $marc8 .= ESCAPE . SINGLE_G1_A . $DEFAULT_G1; }
452 }
453
454 return $marc8;
455}
456
- -
459=head1 DEFAULT CHARACTER SETS
460
461If you need to alter the default character sets you can set the
462$MARC::Charset::DEFAULT_G0 and $MARC::Charset::DEFAULT_G1 variables to the
463appropriate character set code:
464
465 use MARC::Charset::Constants qw(:all);
466 $MARC::Charset::DEFAULT_G0 = BASIC_ARABIC;
467 $MARC::Charset::DEFAULT_G1 = EXTENDED_ARABIC;
468
469=head1 SEE ALSO
470
471=over 4
472
473=item * L<MARC::Charset::Constant>
474
475=item * L<MARC::Charset::Table>
476
477=item * L<MARC::Charset::Code>
478
479=item * L<MARC::Charset::Compiler>
480
481=item * L<MARC::Record>
482
483=item * L<MARC::XML>
484
485=back
486
487=head1 AUTHOR
488
489Ed Summers (ehs@pobox.com)
490
491=cut
492
493
494sub _process_escape
495{
496 ## this stuff is kind of scary ... for an explanation of what is
497 ## going on here check out the MARC-8 specs at LC.
498 ## http://lcweb.loc.gov/marc/specifications/speccharmarc8.html
499 my ($str_ref, $left, $right) = @_;
500
501 # first char needs to be an escape or else this isn't an escape sequence
502 return $left unless substr($$str_ref, $left, 1) eq ESCAPE;
503
504 ## if we don't have at least one character after the escape
505 ## then this can't be a character escape sequence
506 return $left if ($left+1 >= $right);
507
508 ## pull off the first escape
509 my $esc_char_1 = substr($$str_ref, $left+1, 1);
510
511 ## the first method of escaping to small character sets
512 if ( $esc_char_1 eq GREEK_SYMBOLS
513 or $esc_char_1 eq SUBSCRIPTS
514 or $esc_char_1 eq SUPERSCRIPTS
515 or $esc_char_1 eq ASCII_DEFAULT)
516 {
517 $G0 = $esc_char_1;
518 return $left+2;
519 }
520
521 ## the second more complicated method of escaping to bigger charsets
522 return $left if $left+2 >= $right;
523
524 my $esc_char_2 = substr($$str_ref, $left+2, 1);
525 my $esc_chars = $esc_char_1 . $esc_char_2;
526
527 if ($esc_char_1 eq SINGLE_G0_A
528 or $esc_char_1 eq SINGLE_G0_B)
529 {
530 $G0 = $esc_char_2;
531 return $left+3;
532 }
533
534 elsif ($esc_char_1 eq SINGLE_G1_A
535 or $esc_char_1 eq SINGLE_G1_B)
536 {
537 $G1 = $esc_char_2;
538 return $left+3;
539 }
540
541 elsif ( $esc_char_1 eq MULTI_G0_A ) {
542 $G0 = $esc_char_2;
543 return $left+3;
544 }
545
546 elsif ($esc_chars eq MULTI_G0_B
547 and ($left+3 < $right))
548 {
549 $G0 = substr($$str_ref, $left+3, 1);
550 return $left+4;
551 }
552
553 elsif (($esc_chars eq MULTI_G1_A or $esc_chars eq MULTI_G1_B)
554 and ($left + 3 < $right))
555 {
556 $G1 = substr($$str_ref, $left+3, 1);
557 return $left+4;
558 }
559
560 # we should never get here
561 warn("seem to have fallen through in _process_escape()");
562 return $left;
563}
564
565sub reset_charsets
566{
567 $G0 = $DEFAULT_G0;
568 $G1 = $DEFAULT_G1;
569}
570
571110µs1;