Profile of MARC/Charset.pm

Filename	/usr/share/perl5/MARC/Charset.pm
Statements	Executed 26 statements in 4.42ms

Subroutines
Calls	P	F	Exclusive Time	Inclusive Time	Subroutine
1	1	1	1.59ms	13.1ms	MARC::Charset::::BEGIN@14MARC::Charset::BEGIN@14
1	1	1	1.40ms	2.31ms	MARC::Charset::::BEGIN@11MARC::Charset::BEGIN@11
1	1	1	1.04ms	25.9ms	MARC::Charset::::BEGIN@13MARC::Charset::BEGIN@13
1	1	1	11µs	22µs	MARC::Charset::::BEGIN@3MARC::Charset::BEGIN@3
1	1	1	8µs	28µs	MARC::Charset::::BEGIN@12MARC::Charset::BEGIN@12
1	1	1	8µs	170µs	MARC::Charset::::BEGIN@15MARC::Charset::BEGIN@15
1	1	1	7µs	12µs	MARC::Charset::::BEGIN@4MARC::Charset::BEGIN@4
1	1	1	7µs	64µs	MARC::Charset::::BEGIN@8MARC::Charset::BEGIN@8
0	0	0	0s	0s	MARC::Charset::::_process_escapeMARC::Charset::_process_escape
0	0	0	0s	0s	MARC::Charset::::assume_encodingMARC::Charset::assume_encoding
0	0	0	0s	0s	MARC::Charset::::assume_unicodeMARC::Charset::assume_unicode
0	0	0	0s	0s	MARC::Charset::::ignore_errorsMARC::Charset::ignore_errors
0	0	0	0s	0s	MARC::Charset::::marc8_to_utf8MARC::Charset::marc8_to_utf8
0	0	0	0s	0s	MARC::Charset::::reset_charsetsMARC::Charset::reset_charsets
0	0	0	0s	0s	MARC::Charset::::utf8_to_marc8MARC::Charset::utf8_to_marc8

Call graph for these subroutines as a Graphviz dot language file.

Line	State ments	Time on line	Calls	Time in subs	Code
1					package MARC::Charset;
2
3	2	22µs	2	34µs	# spent 22µs (11+12) within MARC::Charset::BEGIN@3 which was called: # once (11µs+12µs) by MARC::File::XML::BEGIN@11 at line 3 use strict; # spent 22µs making 1 call to MARC::Charset::BEGIN@3 # spent 12µs making 1 call to strict::import
4	2	31µs	2	16µs	# spent 12µs (7+5) within MARC::Charset::BEGIN@4 which was called: # once (7µs+5µs) by MARC::File::XML::BEGIN@11 at line 4 use warnings; # spent 12µs making 1 call to MARC::Charset::BEGIN@4 # spent 5µs making 1 call to warnings::import
5
6	1	700ns			our $VERSION = '1.35';
7
8	2	33µs	2	122µs	# spent 64µs (7+58) within MARC::Charset::BEGIN@8 which was called: # once (7µs+58µs) by MARC::File::XML::BEGIN@11 at line 8 use base qw(Exporter); # spent 64µs making 1 call to MARC::Charset::BEGIN@8 # spent 58µs making 1 call to base::import
9	1	1µs			our @EXPORT_OK = qw(marc8_to_utf8 utf8_to_marc8);
10
11	2	755µs	2	2.34ms	# spent 2.31ms (1.40+902µs) within MARC::Charset::BEGIN@11 which was called: # once (1.40ms+902µs) by MARC::File::XML::BEGIN@11 at line 11 use Unicode::Normalize; # spent 2.31ms making 1 call to MARC::Charset::BEGIN@11 # spent 28µs making 1 call to Exporter::import
12	2	24µs	2	47µs	# spent 28µs (8+20) within MARC::Charset::BEGIN@12 which was called: # once (8µs+20µs) by MARC::File::XML::BEGIN@11 at line 12 use Encode 'decode'; # spent 28µs making 1 call to MARC::Charset::BEGIN@12 # spent 20µs making 1 call to Exporter::import
13	2	665µs	2	26.0ms	# spent 25.9ms (1.04+24.9) within MARC::Charset::BEGIN@13 which was called: # once (1.04ms+24.9ms) by MARC::File::XML::BEGIN@11 at line 13 use charnames ':full'; # spent 25.9ms making 1 call to MARC::Charset::BEGIN@13 # spent 144µs making 1 call to charnames::import
14	2	1.07ms	1	13.1ms	# spent 13.1ms (1.59+11.6) within MARC::Charset::BEGIN@14 which was called: # once (1.59ms+11.6ms) by MARC::File::XML::BEGIN@11 at line 14 use MARC::Charset::Table; # spent 13.1ms making 1 call to MARC::Charset::BEGIN@14
15	2	1.79ms	2	332µs	# spent 170µs (8+162) within MARC::Charset::BEGIN@15 which was called: # once (8µs+162µs) by MARC::File::XML::BEGIN@11 at line 15 use MARC::Charset::Constants qw(:all); # spent 170µs making 1 call to MARC::Charset::BEGIN@15 # spent 162µs making 1 call to Exporter::import
16
17					=head1 NAME
18
19					MARC::Charset - convert MARC-8 encoded strings to UTF-8
20
21					=head1 SYNOPSIS
22
23					# import the marc8_to_utf8 function
24					use MARC::Charset 'marc8_to_utf8';
25
26					# prepare STDOUT for utf8
27					binmode(STDOUT, 'utf8');
28
29					# print out some marc8 as utf8
30					print marc8_to_utf8($marc8_string);
31
32					=head1 DESCRIPTION
33
34					MARC::Charset allows you to turn MARC-8 encoded strings into UTF-8
35					strings. MARC-8 is a single byte character encoding that predates unicode, and
36					allows you to put non-Roman scripts in MARC bibliographic records.
37
38					http://www.loc.gov/marc/specifications/spechome.html
39
40					=head1 EXPORTS
41
42					=cut
43
44					# get the mapping table
45	1	2µs	1	69µs	our $table = MARC::Charset::Table->new(); # spent 69µs making 1 call to MARC::Charset::Table::new
46
47					# set default character sets
48					# these are viewable at the package level
49					# in case someone wants to set them
50	1	300ns			our $DEFAULT_G0 = ASCII_DEFAULT;
51	1	100ns			our $DEFAULT_G1 = EXTENDED_LATIN;
52
53	1	23µs			our %SPECIAL_DECOMPOSABLE = (
54					chr(0x01a0) => chr(0x01a0), # uppercase o-hook
55					chr(0x01af) => chr(0x01af), # uppercase u-hook
56					chr(0x01a1) => chr(0x01a1), # lowercase o-hook
57					chr(0x01b0) => chr(0x01b0), # lowercase u-hook
58					chr(0x1ef1) => chr(0x01b0) . chr(0x0323), # lowercase u-hook with dot below
59					chr(0x1ee9) => chr(0x01b0) . chr(0x0301), # lowercase u-hook with acute
60					# Arabic to not decompose
61					chr(0x0622) => chr(0x0622),
62					chr(0x0623) => chr(0x0623),
63					chr(0x0624) => chr(0x0624),
64					chr(0x0625) => chr(0x0625),
65					chr(0x0626) => chr(0x0626),
66					chr(0x0649) => chr(0x0649),
67					chr(0x0671) => chr(0x0671),
68					chr(0x06c0) => chr(0x06c0),
69					chr(0x06D3) => chr(0x06D3),
70					# Cyrillic to not decompose
71					chr(0x0439) => chr(0x0439),
72					chr(0x0419) => chr(0x0419),
73					chr(0x0453) => chr(0x0453),
74					chr(0x0451) => chr(0x0451),
75					chr(0x0457) => chr(0x0457),
76					chr(0x045C) => chr(0x045C),
77					chr(0x045E) => chr(0x045E),
78					chr(0x0403) => chr(0x0403),
79					chr(0x0401) => chr(0x0401),
80					chr(0x0407) => chr(0x0407),
81					chr(0x040C) => chr(0x040C),
82					chr(0x040E) => chr(0x040E),
83					# Katakana to not decompose
84					chr(0x309B) => chr(0x309B),
85					chr(0x309C) => chr(0x309C),
86					chr(0x30AC) => chr(0x30AC),
87					chr(0x30AE) => chr(0x30AE),
88					chr(0x30B0) => chr(0x30B0),
89					chr(0x30B2) => chr(0x30B2),
90					chr(0x30B4) => chr(0x30B4),
91					chr(0x30B6) => chr(0x30B6),
92					chr(0x30B8) => chr(0x30B8),
93					chr(0x30BA) => chr(0x30BA),
94					chr(0x30BC) => chr(0x30BC),
95					chr(0x30BE) => chr(0x30BE),
96					chr(0x30C0) => chr(0x30C0),
97					chr(0x30C2) => chr(0x30C2),
98					chr(0x30C5) => chr(0x30C5),
99					chr(0x30C7) => chr(0x30C7),
100					chr(0x30C9) => chr(0x30C9),
101					chr(0x30D0) => chr(0x30D0),
102					chr(0x30D1) => chr(0x30D1),
103					chr(0x30D3) => chr(0x30D3),
104					chr(0x30D4) => chr(0x30D4),
105					chr(0x30D6) => chr(0x30D6),
106					chr(0x30D7) => chr(0x30D7),
107					chr(0x30D9) => chr(0x30D9),
108					chr(0x30DA) => chr(0x30DA),
109					chr(0x30DC) => chr(0x30DC),
110					chr(0x30DD) => chr(0x30DD),
111					chr(0x30F4) => chr(0x30F4),
112					chr(0x30F7) => chr(0x30F7),
113					chr(0x30F8) => chr(0x30F8),
114					chr(0x30F9) => chr(0x30F9),
115					chr(0x30FA) => chr(0x30FA),
116					chr(0x30FE) => chr(0x30FE),
117					chr(0x30FF) => chr(0x30FF),
118					);
119
120					=head2 ignore_errors()
121
122					Tells MARC::Charset whether or not to ignore all encoding errors, and
123					returns the current setting. This is helpful if you have records that
124					contain both MARC8 and UNICODE characters.
125
126					my $ignore = MARC::Charset->ignore_errors();
127
128					MARC::Charset->ignore_errors(1); # ignore errors
129					MARC::Charset->ignore_errors(0); # DO NOT ignore errors
130
131					=cut
132
133
134	1	200ns			our $_ignore_errors = 0;
135					sub ignore_errors {
136					my ($self,$i) = @_;
137					$_ignore_errors = $i if (defined($i));
138					return $_ignore_errors;
139					}
140
141
142					=head2 assume_unicode()
143
144					Tells MARC::Charset whether or not to assume UNICODE when an error is
145					encountered in ignore_errors mode and returns the current setting.
146					This is helpful if you have records that contain both MARC8 and UNICODE
147					characters.
148
149					my $setting = MARC::Charset->assume_unicode();
150
151					MARC::Charset->assume_unicode(1); # assume characters are unicode (utf-8)
152					MARC::Charset->assume_unicode(0); # DO NOT assume characters are unicode
153
154					=cut
155
156
157	1	100ns			our $_assume = '';
158					sub assume_unicode {
159					my ($self,$i) = @_;
160					$_assume = 'utf8' if (defined($i) and $i);
161					return 1 if ($_assume eq 'utf8');
162					}
163
164
165					=head2 assume_encoding()
166
167					Tells MARC::Charset whether or not to assume a specific encoding when an error
168					is encountered in ignore_errors mode and returns the current setting. This
169					is helpful if you have records that contain both MARC8 and other characters.
170
171					my $setting = MARC::Charset->assume_encoding();
172
173					MARC::Charset->assume_encoding('cp850'); # assume characters are cp850
174					MARC::Charset->assume_encoding(''); # DO NOT assume any encoding
175
176					=cut
177
178
179					sub assume_encoding {
180					my ($self,$i) = @_;
181					$_assume = $i if (defined($i));
182					return $_assume;
183					}
184
185
186					# place holders for working graphical character sets
187	1	100ns			my $G0;
188					my $G1;
189
190					=head2 marc8_to_utf8()
191
192					Converts a MARC-8 encoded string to UTF-8.
193
194					my $utf8 = marc8_to_utf8($marc8);
195
196					If you'd like to ignore errors pass in a true value as the 2nd
197					parameter or call MARC::Charset->ignore_errors() with a true
198					value:
199
200					my $utf8 = marc8_to_utf8($marc8, 'ignore-errors');
201
202					or
203
204					MARC::Charset->ignore_errors(1);
205					my $utf8 = marc8_to_utf8($marc8);
206
207					=cut
208
209
210					sub marc8_to_utf8
211					{
212					my ($marc8, $ignore_errors) = @_;
213					reset_charsets();
214
215					$ignore_errors = $_ignore_errors if (!defined($ignore_errors));
216
217					# holder for our utf8
218					my $utf8 = '';
219
220					my $index = 0;
221					my $length = length($marc8);
222					my $combining = '';
223					CHAR_LOOP: while ($index < $length)
224					{
225					# whitespace, line feeds and carriage returns just get added on unmolested
226					if (substr($marc8, $index, 1) =~ m/(\s+\|\x0A+\|\x0D+)/so)
227					{
228					$utf8 .= $1;
229					$index += 1;
230					next CHAR_LOOP;
231					}
232
233					# look for any escape sequences
234					my $new_index = _process_escape(\$marc8, $index, $length);
235					if ($new_index > $index)
236					{
237					$index = $new_index;
238					next CHAR_LOOP;
239					}
240
241					my $found;
242					CHARSET_LOOP: foreach my $charset ($G0, $G1)
243					{
244
245					# cjk characters are a string of three chars
246					my $char_size = $charset eq CJK ? 3 : 1;
247
248					# extract the next code point to examine
249					my $chunk = substr($marc8, $index, $char_size);
250
251					my $code;
252					if ($char_size == 1) {
253					my $codepoint = ord($chunk);
254					if ($codepoint >= 0x21 && $codepoint <= 0x7e) {
255					# character is G0
256					$code = $table->lookup_by_marc8($G0, $chunk);
257					} elsif ($codepoint >= 0xa1 && $codepoint <= 0xfe) {
258					# character is G1, map it to G0 before atttempting lookup
259					$code = $table->lookup_by_marc8($G1, chr($codepoint - 128));
260					} elsif ($codepoint >= 0x88 && $codepoint <= 0x8e) {
261					# in the C1 range used by MARC8
262					$code = $table->lookup_by_marc8(EXTENDED_LATIN, $chunk);
263					} elsif ($codepoint >= 0x1b && $codepoint <= 0x1f) {
264					# in the C0 range used by MARC8
265					$code = $table->lookup_by_marc8(BASIC_LATIN, $chunk);
266					}
267					} else {
268					# EACC doesn't need G0/G1 conversion
269					$code = $table->lookup_by_marc8($charset, $chunk);
270					}
271
272					# try the next character set if no mapping was found
273					next CHARSET_LOOP if ! $code;
274					$found = 1;
275
276					# gobble up all combining characters for appending later
277					# this is necessary because combinging characters precede
278					# the character they modify in MARC-8, whereas they follow
279					# the character they modify in UTF-8.
280					if ($code->is_combining())
281					{
282					# If the current character is the right half of a MARC-8
283					# ligature or double tilde, we don't want to include
284					# it in the UTF-8 output. For the explanation, see
285					# http://lcweb2.loc.gov/diglib/codetables/45.html#Note1
286					# Note that if the MARC-8 string includes a right half
287					# without the corresponding left half, the right half will
288					# get dropped instead of being mapped to its UCS alternate.
289					# That's OK since including only one half of a double diacritic
290					# was presumably a mistake to begin with.
291					unless (defined $code->marc_left_half())
292					{
293					$combining .= $code->char_value();
294					}
295					}
296					else
297					{
298					$utf8 .= $code->char_value() . $combining;
299					$combining = '';
300					}
301
302					$index += $char_size;
303					next CHAR_LOOP;
304					}
305
306					if (!$found)
307					{
308					warn(sprintf("no mapping found for [0x\%X] at position $index in $marc8 ".
309					"g0=".MARC::Charset::Constants::charset_name($G0) . " " .
310					"g1=".MARC::Charset::Constants::charset_name($G1), unpack('C',substr($marc8,$index,1))));
311					if (!$ignore_errors)
312					{
313					reset_charsets();
314					return;
315					}
316					if ($_assume)
317					{
318					reset_charsets();
319					return NFC(decode($_assume => $marc8));
320					}
321					$index += 1;
322					}
323
324					}
325
326					# return the utf8
327					reset_charsets();
328					utf8::upgrade($utf8);
329					return $utf8;
330					}
331
- -
334					=head2 utf8_to_marc8()
335
336					Will attempt to translate utf8 into marc8.
337
338					my $marc8 = utf8_to_marc8($utf8);
339
340					If you'd like to ignore errors, or characters that can't be
341					converted to marc8 then pass in a true value as the second
342					parameter:
343
344					my $marc8 = utf8_to_marc8($utf8, 'ignore-errors');
345
346					or
347
348					MARC::Charset->ignore_errors(1);
349					my $utf8 = marc8_to_utf8($marc8);
350
351					=cut
352
353					sub utf8_to_marc8
354					{
355					my ($utf8, $ignore_errors) = @_;
356					reset_charsets();
357
358					$ignore_errors = $_ignore_errors if (!defined($ignore_errors));
359
360					# decompose combined characters
361					$utf8 = join('',
362					map { exists $SPECIAL_DECOMPOSABLE{$_} ? $SPECIAL_DECOMPOSABLE{$_} : NFD($_) }
363					split //, $utf8
364					);
365
366					my $len = length($utf8);
367					my $marc8 = '';
368					for (my $i=0; $i<$len; $i++)
369					{
370					my $slice = substr($utf8, $i, 1);
371
372					# spaces are copied from utf8 into marc8
373					if ($slice eq ' ')
374					{
375					$marc8 .= ' ';
376					next;
377					}
378
379					# try to find the code point in our mapping table
380					my $code = $table->lookup_by_utf8($slice);
381
382					if (! $code)
383					{
384					warn("no mapping found at position $i in $utf8");
385					reset_charsets() and return unless $ignore_errors;
386					}
387
388					# if it's a combining character move it around
389					if ($code->is_combining())
390					{
391					my $prev = chop($marc8);
392					if ($code->marc_left_half())
393					{
394					# don't add the MARC-8 right half character
395					# if it was already inserted when the double
396					# diacritic was converted from UTF-8
397					if ($code->marc_value() eq substr($marc8, -1, 1))
398					{
399					$marc8 .= $prev;
400					next;
401					}
402					}
403					$marc8 .= $code->marc_value() . $prev;
404					if ($code->marc_right_half())
405					{
406					$marc8 .= chr(hex($code->marc_right_half()));
407					}
408					next;
409					}
410
411					# look to see if we need to escape to a new G0 charset
412					my $charset_value = $code->charset_value();
413
414					if ($code->default_charset_group() eq 'G0'
415					and $G0 ne $charset_value)
416					{
417					if ($G0 eq ASCII_DEFAULT and $charset_value eq BASIC_LATIN)
418					{
419					# don't bother escaping, they're functionally the same
420					}
421					else
422					{
423					$marc8 .= $code->get_escape();
424					$G0 = $charset_value;
425					}
426					}
427
428					# look to see if we need to escape to a new G1 charset
429					elsif ($code->default_charset_group() eq 'G1'
430					and $G1 ne $charset_value)
431					{
432					$marc8 .= $code->get_escape();
433					$G1 = $charset_value;
434					}
435
436					$marc8 .= $code->marc_value();
437					}
438
439					# escape back to default G0 if necessary
440					if ($G0 ne $DEFAULT_G0)
441					{
442					if ($DEFAULT_G0 eq ASCII_DEFAULT) { $marc8 .= ESCAPE . ASCII_DEFAULT; }
443					elsif ($DEFAULT_G0 eq CJK) { $marc8 .= ESCAPE . MULTI_G0_A . CJK; }
444					else { $marc8 .= ESCAPE . SINGLE_G0_A . $DEFAULT_G0; }
445					}
446
447					# escape back to default G1 if necessary
448					if ($G1 ne $DEFAULT_G1)
449					{
450					if ($DEFAULT_G1 eq CJK) { $marc8 .= ESCAPE . MULTI_G1_A . $DEFAULT_G1; }
451					else { $marc8 .= ESCAPE . SINGLE_G1_A . $DEFAULT_G1; }
452					}
453
454					return $marc8;
455					}
456
- -
459					=head1 DEFAULT CHARACTER SETS
460
461					If you need to alter the default character sets you can set the
462					$MARC::Charset::DEFAULT_G0 and $MARC::Charset::DEFAULT_G1 variables to the
463					appropriate character set code:
464
465					use MARC::Charset::Constants qw(:all);
466					$MARC::Charset::DEFAULT_G0 = BASIC_ARABIC;
467					$MARC::Charset::DEFAULT_G1 = EXTENDED_ARABIC;
468
469					=head1 SEE ALSO
470
471					=over 4
472
473					=item * L<MARC::Charset::Constant>
474
475					=item * L<MARC::Charset::Table>
476
477					=item * L<MARC::Charset::Code>
478
479					=item * L<MARC::Charset::Compiler>
480
481					=item * L<MARC::Record>
482
483					=item * L<MARC::XML>
484
485					=back
486
487					=head1 AUTHOR
488
489					Ed Summers (ehs@pobox.com)
490
491					=cut
492
493
494					sub _process_escape
495					{
496					## this stuff is kind of scary ... for an explanation of what is
497					## going on here check out the MARC-8 specs at LC.
498					## http://lcweb.loc.gov/marc/specifications/speccharmarc8.html
499					my ($str_ref, $left, $right) = @_;
500
501					# first char needs to be an escape or else this isn't an escape sequence
502					return $left unless substr($$str_ref, $left, 1) eq ESCAPE;
503
504					## if we don't have at least one character after the escape
505					## then this can't be a character escape sequence
506					return $left if ($left+1 >= $right);
507
508					## pull off the first escape
509					my $esc_char_1 = substr($$str_ref, $left+1, 1);
510
511					## the first method of escaping to small character sets
512					if ( $esc_char_1 eq GREEK_SYMBOLS
513					or $esc_char_1 eq SUBSCRIPTS
514					or $esc_char_1 eq SUPERSCRIPTS
515					or $esc_char_1 eq ASCII_DEFAULT)
516					{
517					$G0 = $esc_char_1;
518					return $left+2;
519					}
520
521					## the second more complicated method of escaping to bigger charsets
522					return $left if $left+2 >= $right;
523
524					my $esc_char_2 = substr($$str_ref, $left+2, 1);
525					my $esc_chars = $esc_char_1 . $esc_char_2;
526
527					if ($esc_char_1 eq SINGLE_G0_A
528					or $esc_char_1 eq SINGLE_G0_B)
529					{
530					$G0 = $esc_char_2;
531					return $left+3;
532					}
533
534					elsif ($esc_char_1 eq SINGLE_G1_A
535					or $esc_char_1 eq SINGLE_G1_B)
536					{
537					$G1 = $esc_char_2;
538					return $left+3;
539					}
540
541					elsif ( $esc_char_1 eq MULTI_G0_A ) {
542					$G0 = $esc_char_2;
543					return $left+3;
544					}
545
546					elsif ($esc_chars eq MULTI_G0_B
547					and ($left+3 < $right))
548					{
549					$G0 = substr($$str_ref, $left+3, 1);
550					return $left+4;
551					}
552
553					elsif (($esc_chars eq MULTI_G1_A or $esc_chars eq MULTI_G1_B)
554					and ($left + 3 < $right))
555					{
556					$G1 = substr($$str_ref, $left+3, 1);
557					return $left+4;
558					}
559
560					# we should never get here
561					warn("seem to have fallen through in _process_escape()");
562					return $left;
563					}
564
565					sub reset_charsets
566					{
567					$G0 = $DEFAULT_G0;
568					$G1 = $DEFAULT_G1;
569					}
570
571	1	10µs			1;