Filename | /usr/share/perl5/MARC/Charset.pm |
Statements | Executed 34 statements in 2.82ms |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
1 | 1 | 1 | 1.97ms | 4.41ms | BEGIN@12 | MARC::Charset::
1 | 1 | 1 | 1.03ms | 2.44ms | BEGIN@10 | MARC::Charset::
1 | 1 | 1 | 919µs | 11.9ms | BEGIN@13 | MARC::Charset::
1 | 1 | 1 | 30µs | 37µs | BEGIN@4 | MARC::Charset::
1 | 1 | 1 | 22µs | 168µs | BEGIN@11 | MARC::Charset::
1 | 1 | 1 | 15µs | 43µs | BEGIN@5 | MARC::Charset::
1 | 1 | 1 | 11µs | 253µs | BEGIN@14 | MARC::Charset::
1 | 1 | 1 | 11µs | 86µs | BEGIN@7 | MARC::Charset::
0 | 0 | 0 | 0s | 0s | _process_escape | MARC::Charset::
0 | 0 | 0 | 0s | 0s | assume_encoding | MARC::Charset::
0 | 0 | 0 | 0s | 0s | assume_unicode | MARC::Charset::
0 | 0 | 0 | 0s | 0s | ignore_errors | MARC::Charset::
0 | 0 | 0 | 0s | 0s | marc8_to_utf8 | MARC::Charset::
0 | 0 | 0 | 0s | 0s | reset_charsets | MARC::Charset::
0 | 0 | 0 | 0s | 0s | utf8_to_marc8 | MARC::Charset::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | package MARC::Charset; | ||||
2 | |||||
3 | 1 | 1µs | our $VERSION = '1.2'; | ||
4 | 3 | 41µs | 2 | 45µs | # spent 37µs (30+8) within MARC::Charset::BEGIN@4 which was called:
# once (30µs+8µs) by MARC::File::SAX::BEGIN@14 at line 4 # spent 37µs making 1 call to MARC::Charset::BEGIN@4
# spent 8µs making 1 call to strict::import |
5 | 3 | 36µs | 2 | 71µs | # spent 43µs (15+28) within MARC::Charset::BEGIN@5 which was called:
# once (15µs+28µs) by MARC::File::SAX::BEGIN@14 at line 5 # spent 43µs making 1 call to MARC::Charset::BEGIN@5
# spent 28µs making 1 call to warnings::import |
6 | |||||
7 | 3 | 77µs | 2 | 161µs | # spent 86µs (11+75) within MARC::Charset::BEGIN@7 which was called:
# once (11µs+75µs) by MARC::File::SAX::BEGIN@14 at line 7 # spent 86µs making 1 call to MARC::Charset::BEGIN@7
# spent 75µs making 1 call to base::import |
8 | 1 | 2µs | our @EXPORT_OK = qw(marc8_to_utf8 utf8_to_marc8); | ||
9 | |||||
10 | 3 | 225µs | 2 | 2.56ms | # spent 2.44ms (1.03+1.41) within MARC::Charset::BEGIN@10 which was called:
# once (1.03ms+1.41ms) by MARC::File::SAX::BEGIN@14 at line 10 # spent 2.44ms making 1 call to MARC::Charset::BEGIN@10
# spent 123µs making 1 call to Exporter::import |
11 | 3 | 57µs | 2 | 315µs | # spent 168µs (22+147) within MARC::Charset::BEGIN@11 which was called:
# once (22µs+147µs) by MARC::File::SAX::BEGIN@14 at line 11 # spent 168µs making 1 call to MARC::Charset::BEGIN@11
# spent 146µs making 1 call to Exporter::import |
12 | 3 | 172µs | 2 | 6.72ms | # spent 4.41ms (1.97+2.45) within MARC::Charset::BEGIN@12 which was called:
# once (1.97ms+2.45ms) by MARC::File::SAX::BEGIN@14 at line 12 # spent 4.41ms making 1 call to MARC::Charset::BEGIN@12
# spent 2.31ms making 1 call to charnames::import |
13 | 3 | 176µs | 2 | 11.9ms | # spent 11.9ms (919µs+11.0) within MARC::Charset::BEGIN@13 which was called:
# once (919µs+11.0ms) by MARC::File::SAX::BEGIN@14 at line 13 # spent 11.9ms making 1 call to MARC::Charset::BEGIN@13
# spent 4µs making 1 call to UNIVERSAL::import |
14 | 3 | 2.01ms | 2 | 495µs | # spent 253µs (11+242) within MARC::Charset::BEGIN@14 which was called:
# once (11µs+242µs) by MARC::File::SAX::BEGIN@14 at line 14 # spent 253µs making 1 call to MARC::Charset::BEGIN@14
# spent 242µs making 1 call to Exporter::import |
15 | |||||
16 | =head1 NAME | ||||
17 | |||||
- - | |||||
43 | # get the mapping table | ||||
44 | 1 | 6µs | 1 | 245µs | our $table = MARC::Charset::Table->new(); # spent 245µs making 1 call to MARC::Charset::Table::new |
45 | |||||
46 | # set default character sets | ||||
47 | # these are viewable at the package level | ||||
48 | # in case someone wants to set them | ||||
49 | 1 | 700ns | our $DEFAULT_G0 = ASCII_DEFAULT; | ||
50 | 1 | 500ns | our $DEFAULT_G1 = EXTENDED_LATIN; | ||
51 | |||||
52 | =head2 ignore_errors() | ||||
53 | |||||
- - | |||||
66 | 1 | 400ns | our $_ignore_errors = 0; | ||
67 | sub ignore_errors { | ||||
68 | my ($self,$i) = @_; | ||||
69 | $_ignore_errors = $i if (defined($i)); | ||||
70 | return $_ignore_errors; | ||||
71 | } | ||||
72 | |||||
73 | |||||
74 | =head2 assume_unicode() | ||||
75 | |||||
- - | |||||
89 | 1 | 500ns | our $_assume = ''; | ||
90 | sub assume_unicode { | ||||
91 | my ($self,$i) = @_; | ||||
92 | $_assume = 'utf8' if (defined($i) and $i); | ||||
93 | return 1 if ($_assume eq 'utf8'); | ||||
94 | } | ||||
95 | |||||
96 | |||||
97 | =head2 assume_encoding() | ||||
98 | |||||
- - | |||||
111 | sub assume_encoding { | ||||
112 | my ($self,$i) = @_; | ||||
113 | $_assume = $i if (defined($i)); | ||||
114 | return $_assume; | ||||
115 | } | ||||
116 | |||||
117 | |||||
118 | # place holders for working graphical character sets | ||||
119 | 1 | 200ns | my $G0; | ||
120 | 1 | 200ns | my $G1; | ||
121 | |||||
122 | =head2 marc8_to_utf8() | ||||
123 | |||||
- - | |||||
142 | sub marc8_to_utf8 | ||||
143 | { | ||||
144 | my ($marc8, $ignore_errors) = @_; | ||||
145 | reset_charsets(); | ||||
146 | |||||
147 | $ignore_errors = $_ignore_errors if (!defined($ignore_errors)); | ||||
148 | |||||
149 | # holder for our utf8 | ||||
150 | my $utf8 = ''; | ||||
151 | |||||
152 | my $index = 0; | ||||
153 | my $length = length($marc8); | ||||
154 | my $combining = ''; | ||||
155 | CHAR_LOOP: while ($index < $length) | ||||
156 | { | ||||
157 | # whitespace, line feeds and carriage returns just get added on unmolested | ||||
158 | if (substr($marc8, $index, 1) =~ m/(\s+|\x0A+|\x0D+)/so) | ||||
159 | { | ||||
160 | $utf8 .= $1; | ||||
161 | $index += 1; | ||||
162 | next CHAR_LOOP; | ||||
163 | } | ||||
164 | |||||
165 | # look for any escape sequences | ||||
166 | my $new_index = _process_escape(\$marc8, $index, $length); | ||||
167 | if ($new_index > $index) | ||||
168 | { | ||||
169 | $index = $new_index; | ||||
170 | next CHAR_LOOP; | ||||
171 | } | ||||
172 | |||||
173 | my $found; | ||||
174 | CHARSET_LOOP: foreach my $charset ($G0, $G1) | ||||
175 | { | ||||
176 | |||||
177 | # cjk characters are a string of three chars | ||||
178 | my $char_size = $charset eq CJK ? 3 : 1; | ||||
179 | |||||
180 | # extract the next code point to examine | ||||
181 | my $chunk = substr($marc8, $index, $char_size); | ||||
182 | |||||
183 | # look up the character to see if it's in our mapping | ||||
184 | my $code = $table->lookup_by_marc8($charset, $chunk); | ||||
185 | |||||
186 | # try the next character set if no mapping was found | ||||
187 | next CHARSET_LOOP if ! $code; | ||||
188 | $found = 1; | ||||
189 | |||||
190 | # gobble up all combining characters for appending later | ||||
191 | # this is necessary because combinging characters precede | ||||
192 | # the character they modifiy in MARC-8, whereas they follow | ||||
193 | # the character they modify in UTF-8. | ||||
194 | if ($code->is_combining()) | ||||
195 | { | ||||
196 | $combining .= $code->char_value(); | ||||
197 | } | ||||
198 | else | ||||
199 | { | ||||
200 | $utf8 .= $code->char_value() . $combining; | ||||
201 | $combining = ''; | ||||
202 | } | ||||
203 | |||||
204 | $index += $char_size; | ||||
205 | next CHAR_LOOP; | ||||
206 | } | ||||
207 | |||||
208 | if (!$found) | ||||
209 | { | ||||
210 | warn(sprintf("no mapping found for [0x\%X] at position $index in $marc8 ". | ||||
211 | "g0=".MARC::Charset::Constants::charset_name($G0) . " " . | ||||
212 | "g1=".MARC::Charset::Constants::charset_name($G1), unpack('C',substr($marc8,$index,1)))); | ||||
213 | if (!$ignore_errors) | ||||
214 | { | ||||
215 | reset_charsets(); | ||||
216 | return; | ||||
217 | } | ||||
218 | if ($_assume) | ||||
219 | { | ||||
220 | reset_charsets(); | ||||
221 | return NFC(decode($_assume => $marc8)); | ||||
222 | } | ||||
223 | $index += 1; | ||||
224 | } | ||||
225 | |||||
226 | } | ||||
227 | |||||
228 | # return the utf8 | ||||
229 | reset_charsets(); | ||||
230 | return $utf8; | ||||
231 | } | ||||
232 | |||||
- - | |||||
235 | =head2 utf8_to_marc8() | ||||
236 | |||||
- - | |||||
254 | sub utf8_to_marc8 | ||||
255 | { | ||||
256 | my ($utf8, $ignore_errors) = @_; | ||||
257 | reset_charsets(); | ||||
258 | |||||
259 | $ignore_errors = $_ignore_errors if (!defined($ignore_errors)); | ||||
260 | |||||
261 | # decompose combined characters | ||||
262 | $utf8 = NFD($utf8); | ||||
263 | |||||
264 | my $len = length($utf8); | ||||
265 | my $marc8 = ''; | ||||
266 | for (my $i=0; $i<$len; $i++) | ||||
267 | { | ||||
268 | my $slice = substr($utf8, $i, 1); | ||||
269 | |||||
270 | # spaces are copied from utf8 into marc8 | ||||
271 | if ($slice eq ' ') | ||||
272 | { | ||||
273 | $marc8 .= ' '; | ||||
274 | next; | ||||
275 | } | ||||
276 | |||||
277 | # try to find the code point in our mapping table | ||||
278 | my $code = $table->lookup_by_utf8($slice); | ||||
279 | |||||
280 | if (! $code) | ||||
281 | { | ||||
282 | warn("no mapping found at position $i in $utf8"); | ||||
283 | reset_charsets() and return unless $ignore_errors; | ||||
284 | } | ||||
285 | |||||
286 | # if it's a combining character move it around | ||||
287 | if ($code->is_combining()) | ||||
288 | { | ||||
289 | my $prev = chop($marc8); | ||||
290 | $marc8 .= $code->marc_value() . $prev; | ||||
291 | next; | ||||
292 | } | ||||
293 | |||||
294 | # look to see if we need to escape to a new G0 charset | ||||
295 | my $charset_value = $code->charset_value(); | ||||
296 | |||||
297 | if ($code->default_charset_group() eq 'G0' | ||||
298 | and $G0 ne $charset_value) | ||||
299 | { | ||||
300 | if ($G0 eq ASCII_DEFAULT and $charset_value eq BASIC_LATIN) | ||||
301 | { | ||||
302 | # don't bother escaping, they're functionally the same | ||||
303 | } | ||||
304 | else | ||||
305 | { | ||||
306 | $marc8 .= $code->get_escape(); | ||||
307 | $G0 = $charset_value; | ||||
308 | } | ||||
309 | } | ||||
310 | |||||
311 | # look to see if we need to escape to a new G1 charset | ||||
312 | elsif ($code->default_charset_group() eq 'G1' | ||||
313 | and $G1 ne $charset_value) | ||||
314 | { | ||||
315 | $marc8 .= $code->get_escape(); | ||||
316 | $G1 = $charset_value; | ||||
317 | } | ||||
318 | |||||
319 | $marc8 .= $code->marc_value(); | ||||
320 | } | ||||
321 | |||||
322 | # escape back to default G0 if necessary | ||||
323 | if ($G0 ne $DEFAULT_G0) | ||||
324 | { | ||||
325 | if ($DEFAULT_G0 eq ASCII_DEFAULT) { $marc8 .= ESCAPE . ASCII_DEFAULT; } | ||||
326 | elsif ($DEFAULT_G0 eq CJK) { $marc8 .= ESCAPE . MULTI_G0_A . CJK; } | ||||
327 | else { $marc8 .= ESCAPE . SINGLE_G0_A . $DEFAULT_G0; } | ||||
328 | } | ||||
329 | |||||
330 | # escape back to default G1 if necessary | ||||
331 | if ($G1 ne $DEFAULT_G1) | ||||
332 | { | ||||
333 | if ($DEFAULT_G1 eq CJK) { $marc8 .= ESCAPE . MULTI_G1_A . $DEFAULT_G1; } | ||||
334 | else { $marc8 .= ESCAPE . SINGLE_G1_A . $DEFAULT_G1; } | ||||
335 | } | ||||
336 | |||||
337 | return $marc8; | ||||
338 | } | ||||
339 | |||||
- - | |||||
342 | =head1 DEFAULT CHARACTER SETS | ||||
343 | |||||
- - | |||||
377 | sub _process_escape | ||||
378 | { | ||||
379 | ## this stuff is kind of scary ... for an explanation of what is | ||||
380 | ## going on here check out the MARC-8 specs at LC. | ||||
381 | ## http://lcweb.loc.gov/marc/specifications/speccharmarc8.html | ||||
382 | my ($str_ref, $left, $right) = @_; | ||||
383 | |||||
384 | # first char needs to be an escape or else this isn't an escape sequence | ||||
385 | return $left unless substr($$str_ref, $left, 1) eq ESCAPE; | ||||
386 | |||||
387 | ## if we don't have at least one character after the escape | ||||
388 | ## then this can't be a character escape sequence | ||||
389 | return $left if ($left+1 >= $right); | ||||
390 | |||||
391 | ## pull off the first escape | ||||
392 | my $esc_char_1 = substr($$str_ref, $left+1, 1); | ||||
393 | |||||
394 | ## the first method of escaping to small character sets | ||||
395 | if ( $esc_char_1 eq GREEK_SYMBOLS | ||||
396 | or $esc_char_1 eq SUBSCRIPTS | ||||
397 | or $esc_char_1 eq SUPERSCRIPTS | ||||
398 | or $esc_char_1 eq ASCII_DEFAULT) | ||||
399 | { | ||||
400 | $G0 = $esc_char_1; | ||||
401 | return $left+2; | ||||
402 | } | ||||
403 | |||||
404 | ## the second more complicated method of escaping to bigger charsets | ||||
405 | return $left if $left+2 >= $right; | ||||
406 | |||||
407 | my $esc_char_2 = substr($$str_ref, $left+2, 1); | ||||
408 | my $esc_chars = $esc_char_1 . $esc_char_2; | ||||
409 | |||||
410 | if ($esc_char_1 eq SINGLE_G0_A | ||||
411 | or $esc_char_1 eq SINGLE_G0_B) | ||||
412 | { | ||||
413 | $G0 = $esc_char_2; | ||||
414 | return $left+3; | ||||
415 | } | ||||
416 | |||||
417 | elsif ($esc_char_1 eq SINGLE_G1_A | ||||
418 | or $esc_char_1 eq SINGLE_G1_B) | ||||
419 | { | ||||
420 | $G1 = $esc_char_2; | ||||
421 | return $left+3; | ||||
422 | } | ||||
423 | |||||
424 | elsif ( $esc_char_1 eq MULTI_G0_A ) { | ||||
425 | $G0 = $esc_char_2; | ||||
426 | return $left+3; | ||||
427 | } | ||||
428 | |||||
429 | elsif ($esc_chars eq MULTI_G0_B | ||||
430 | and ($left+3 < $right)) | ||||
431 | { | ||||
432 | $G0 = substr($$str_ref, $left+3, 1); | ||||
433 | return $left+4; | ||||
434 | } | ||||
435 | |||||
436 | elsif (($esc_chars eq MULTI_G1_A or $esc_chars eq MULTI_G1_B) | ||||
437 | and ($left + 3 < $right)) | ||||
438 | { | ||||
439 | $G1 = substr($$str_ref, $left+3, 1); | ||||
440 | return $left+4; | ||||
441 | } | ||||
442 | |||||
443 | # we should never get here | ||||
444 | warn("seem to have fallen through in _process_escape()"); | ||||
445 | return $left; | ||||
446 | } | ||||
447 | |||||
448 | sub reset_charsets | ||||
449 | { | ||||
450 | $G0 = $DEFAULT_G0; | ||||
451 | $G1 = $DEFAULT_G1; | ||||
452 | } | ||||
453 | |||||
454 | 1 | 14µs | 1; |