Filename | /usr/share/perl5/MARC/Charset.pm |
Statements | Executed 26 statements in 4.42ms |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
1 | 1 | 1 | 1.59ms | 13.1ms | BEGIN@14 | MARC::Charset::
1 | 1 | 1 | 1.40ms | 2.31ms | BEGIN@11 | MARC::Charset::
1 | 1 | 1 | 1.04ms | 25.9ms | BEGIN@13 | MARC::Charset::
1 | 1 | 1 | 11µs | 22µs | BEGIN@3 | MARC::Charset::
1 | 1 | 1 | 8µs | 28µs | BEGIN@12 | MARC::Charset::
1 | 1 | 1 | 8µs | 170µs | BEGIN@15 | MARC::Charset::
1 | 1 | 1 | 7µs | 12µs | BEGIN@4 | MARC::Charset::
1 | 1 | 1 | 7µs | 64µs | BEGIN@8 | MARC::Charset::
0 | 0 | 0 | 0s | 0s | _process_escape | MARC::Charset::
0 | 0 | 0 | 0s | 0s | assume_encoding | MARC::Charset::
0 | 0 | 0 | 0s | 0s | assume_unicode | MARC::Charset::
0 | 0 | 0 | 0s | 0s | ignore_errors | MARC::Charset::
0 | 0 | 0 | 0s | 0s | marc8_to_utf8 | MARC::Charset::
0 | 0 | 0 | 0s | 0s | reset_charsets | MARC::Charset::
0 | 0 | 0 | 0s | 0s | utf8_to_marc8 | MARC::Charset::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | package MARC::Charset; | ||||
2 | |||||
3 | 2 | 22µs | 2 | 34µs | # spent 22µs (11+12) within MARC::Charset::BEGIN@3 which was called:
# once (11µs+12µs) by MARC::File::XML::BEGIN@11 at line 3 # spent 22µs making 1 call to MARC::Charset::BEGIN@3
# spent 12µs making 1 call to strict::import |
4 | 2 | 31µs | 2 | 16µs | # spent 12µs (7+5) within MARC::Charset::BEGIN@4 which was called:
# once (7µs+5µs) by MARC::File::XML::BEGIN@11 at line 4 # spent 12µs making 1 call to MARC::Charset::BEGIN@4
# spent 5µs making 1 call to warnings::import |
5 | |||||
6 | 1 | 700ns | our $VERSION = '1.35'; | ||
7 | |||||
8 | 2 | 33µs | 2 | 122µs | # spent 64µs (7+58) within MARC::Charset::BEGIN@8 which was called:
# once (7µs+58µs) by MARC::File::XML::BEGIN@11 at line 8 # spent 64µs making 1 call to MARC::Charset::BEGIN@8
# spent 58µs making 1 call to base::import |
9 | 1 | 1µs | our @EXPORT_OK = qw(marc8_to_utf8 utf8_to_marc8); | ||
10 | |||||
11 | 2 | 755µs | 2 | 2.34ms | # spent 2.31ms (1.40+902µs) within MARC::Charset::BEGIN@11 which was called:
# once (1.40ms+902µs) by MARC::File::XML::BEGIN@11 at line 11 # spent 2.31ms making 1 call to MARC::Charset::BEGIN@11
# spent 28µs making 1 call to Exporter::import |
12 | 2 | 24µs | 2 | 47µs | # spent 28µs (8+20) within MARC::Charset::BEGIN@12 which was called:
# once (8µs+20µs) by MARC::File::XML::BEGIN@11 at line 12 # spent 28µs making 1 call to MARC::Charset::BEGIN@12
# spent 20µs making 1 call to Exporter::import |
13 | 2 | 665µs | 2 | 26.0ms | # spent 25.9ms (1.04+24.9) within MARC::Charset::BEGIN@13 which was called:
# once (1.04ms+24.9ms) by MARC::File::XML::BEGIN@11 at line 13 # spent 25.9ms making 1 call to MARC::Charset::BEGIN@13
# spent 144µs making 1 call to charnames::import |
14 | 2 | 1.07ms | 1 | 13.1ms | # spent 13.1ms (1.59+11.6) within MARC::Charset::BEGIN@14 which was called:
# once (1.59ms+11.6ms) by MARC::File::XML::BEGIN@11 at line 14 # spent 13.1ms making 1 call to MARC::Charset::BEGIN@14 |
15 | 2 | 1.79ms | 2 | 332µs | # spent 170µs (8+162) within MARC::Charset::BEGIN@15 which was called:
# once (8µs+162µs) by MARC::File::XML::BEGIN@11 at line 15 # spent 170µs making 1 call to MARC::Charset::BEGIN@15
# spent 162µs making 1 call to Exporter::import |
16 | |||||
17 | =head1 NAME | ||||
18 | |||||
19 | MARC::Charset - convert MARC-8 encoded strings to UTF-8 | ||||
20 | |||||
21 | =head1 SYNOPSIS | ||||
22 | |||||
23 | # import the marc8_to_utf8 function | ||||
24 | use MARC::Charset 'marc8_to_utf8'; | ||||
25 | |||||
26 | # prepare STDOUT for utf8 | ||||
27 | binmode(STDOUT, 'utf8'); | ||||
28 | |||||
29 | # print out some marc8 as utf8 | ||||
30 | print marc8_to_utf8($marc8_string); | ||||
31 | |||||
32 | =head1 DESCRIPTION | ||||
33 | |||||
34 | MARC::Charset allows you to turn MARC-8 encoded strings into UTF-8 | ||||
35 | strings. MARC-8 is a single byte character encoding that predates unicode, and | ||||
36 | allows you to put non-Roman scripts in MARC bibliographic records. | ||||
37 | |||||
38 | http://www.loc.gov/marc/specifications/spechome.html | ||||
39 | |||||
40 | =head1 EXPORTS | ||||
41 | |||||
42 | =cut | ||||
43 | |||||
44 | # get the mapping table | ||||
45 | 1 | 2µs | 1 | 69µs | our $table = MARC::Charset::Table->new(); # spent 69µs making 1 call to MARC::Charset::Table::new |
46 | |||||
47 | # set default character sets | ||||
48 | # these are viewable at the package level | ||||
49 | # in case someone wants to set them | ||||
50 | 1 | 300ns | our $DEFAULT_G0 = ASCII_DEFAULT; | ||
51 | 1 | 100ns | our $DEFAULT_G1 = EXTENDED_LATIN; | ||
52 | |||||
53 | 1 | 23µs | our %SPECIAL_DECOMPOSABLE = ( | ||
54 | chr(0x01a0) => chr(0x01a0), # uppercase o-hook | ||||
55 | chr(0x01af) => chr(0x01af), # uppercase u-hook | ||||
56 | chr(0x01a1) => chr(0x01a1), # lowercase o-hook | ||||
57 | chr(0x01b0) => chr(0x01b0), # lowercase u-hook | ||||
58 | chr(0x1ef1) => chr(0x01b0) . chr(0x0323), # lowercase u-hook with dot below | ||||
59 | chr(0x1ee9) => chr(0x01b0) . chr(0x0301), # lowercase u-hook with acute | ||||
60 | # Arabic to not decompose | ||||
61 | chr(0x0622) => chr(0x0622), | ||||
62 | chr(0x0623) => chr(0x0623), | ||||
63 | chr(0x0624) => chr(0x0624), | ||||
64 | chr(0x0625) => chr(0x0625), | ||||
65 | chr(0x0626) => chr(0x0626), | ||||
66 | chr(0x0649) => chr(0x0649), | ||||
67 | chr(0x0671) => chr(0x0671), | ||||
68 | chr(0x06c0) => chr(0x06c0), | ||||
69 | chr(0x06D3) => chr(0x06D3), | ||||
70 | # Cyrillic to not decompose | ||||
71 | chr(0x0439) => chr(0x0439), | ||||
72 | chr(0x0419) => chr(0x0419), | ||||
73 | chr(0x0453) => chr(0x0453), | ||||
74 | chr(0x0451) => chr(0x0451), | ||||
75 | chr(0x0457) => chr(0x0457), | ||||
76 | chr(0x045C) => chr(0x045C), | ||||
77 | chr(0x045E) => chr(0x045E), | ||||
78 | chr(0x0403) => chr(0x0403), | ||||
79 | chr(0x0401) => chr(0x0401), | ||||
80 | chr(0x0407) => chr(0x0407), | ||||
81 | chr(0x040C) => chr(0x040C), | ||||
82 | chr(0x040E) => chr(0x040E), | ||||
83 | # Katakana to not decompose | ||||
84 | chr(0x309B) => chr(0x309B), | ||||
85 | chr(0x309C) => chr(0x309C), | ||||
86 | chr(0x30AC) => chr(0x30AC), | ||||
87 | chr(0x30AE) => chr(0x30AE), | ||||
88 | chr(0x30B0) => chr(0x30B0), | ||||
89 | chr(0x30B2) => chr(0x30B2), | ||||
90 | chr(0x30B4) => chr(0x30B4), | ||||
91 | chr(0x30B6) => chr(0x30B6), | ||||
92 | chr(0x30B8) => chr(0x30B8), | ||||
93 | chr(0x30BA) => chr(0x30BA), | ||||
94 | chr(0x30BC) => chr(0x30BC), | ||||
95 | chr(0x30BE) => chr(0x30BE), | ||||
96 | chr(0x30C0) => chr(0x30C0), | ||||
97 | chr(0x30C2) => chr(0x30C2), | ||||
98 | chr(0x30C5) => chr(0x30C5), | ||||
99 | chr(0x30C7) => chr(0x30C7), | ||||
100 | chr(0x30C9) => chr(0x30C9), | ||||
101 | chr(0x30D0) => chr(0x30D0), | ||||
102 | chr(0x30D1) => chr(0x30D1), | ||||
103 | chr(0x30D3) => chr(0x30D3), | ||||
104 | chr(0x30D4) => chr(0x30D4), | ||||
105 | chr(0x30D6) => chr(0x30D6), | ||||
106 | chr(0x30D7) => chr(0x30D7), | ||||
107 | chr(0x30D9) => chr(0x30D9), | ||||
108 | chr(0x30DA) => chr(0x30DA), | ||||
109 | chr(0x30DC) => chr(0x30DC), | ||||
110 | chr(0x30DD) => chr(0x30DD), | ||||
111 | chr(0x30F4) => chr(0x30F4), | ||||
112 | chr(0x30F7) => chr(0x30F7), | ||||
113 | chr(0x30F8) => chr(0x30F8), | ||||
114 | chr(0x30F9) => chr(0x30F9), | ||||
115 | chr(0x30FA) => chr(0x30FA), | ||||
116 | chr(0x30FE) => chr(0x30FE), | ||||
117 | chr(0x30FF) => chr(0x30FF), | ||||
118 | ); | ||||
119 | |||||
120 | =head2 ignore_errors() | ||||
121 | |||||
122 | Tells MARC::Charset whether or not to ignore all encoding errors, and | ||||
123 | returns the current setting. This is helpful if you have records that | ||||
124 | contain both MARC8 and UNICODE characters. | ||||
125 | |||||
126 | my $ignore = MARC::Charset->ignore_errors(); | ||||
127 | |||||
128 | MARC::Charset->ignore_errors(1); # ignore errors | ||||
129 | MARC::Charset->ignore_errors(0); # DO NOT ignore errors | ||||
130 | |||||
131 | =cut | ||||
132 | |||||
133 | |||||
134 | 1 | 200ns | our $_ignore_errors = 0; | ||
135 | sub ignore_errors { | ||||
136 | my ($self,$i) = @_; | ||||
137 | $_ignore_errors = $i if (defined($i)); | ||||
138 | return $_ignore_errors; | ||||
139 | } | ||||
140 | |||||
141 | |||||
142 | =head2 assume_unicode() | ||||
143 | |||||
144 | Tells MARC::Charset whether or not to assume UNICODE when an error is | ||||
145 | encountered in ignore_errors mode and returns the current setting. | ||||
146 | This is helpful if you have records that contain both MARC8 and UNICODE | ||||
147 | characters. | ||||
148 | |||||
149 | my $setting = MARC::Charset->assume_unicode(); | ||||
150 | |||||
151 | MARC::Charset->assume_unicode(1); # assume characters are unicode (utf-8) | ||||
152 | MARC::Charset->assume_unicode(0); # DO NOT assume characters are unicode | ||||
153 | |||||
154 | =cut | ||||
155 | |||||
156 | |||||
157 | 1 | 100ns | our $_assume = ''; | ||
158 | sub assume_unicode { | ||||
159 | my ($self,$i) = @_; | ||||
160 | $_assume = 'utf8' if (defined($i) and $i); | ||||
161 | return 1 if ($_assume eq 'utf8'); | ||||
162 | } | ||||
163 | |||||
164 | |||||
165 | =head2 assume_encoding() | ||||
166 | |||||
167 | Tells MARC::Charset whether or not to assume a specific encoding when an error | ||||
168 | is encountered in ignore_errors mode and returns the current setting. This | ||||
169 | is helpful if you have records that contain both MARC8 and other characters. | ||||
170 | |||||
171 | my $setting = MARC::Charset->assume_encoding(); | ||||
172 | |||||
173 | MARC::Charset->assume_encoding('cp850'); # assume characters are cp850 | ||||
174 | MARC::Charset->assume_encoding(''); # DO NOT assume any encoding | ||||
175 | |||||
176 | =cut | ||||
177 | |||||
178 | |||||
179 | sub assume_encoding { | ||||
180 | my ($self,$i) = @_; | ||||
181 | $_assume = $i if (defined($i)); | ||||
182 | return $_assume; | ||||
183 | } | ||||
184 | |||||
185 | |||||
186 | # place holders for working graphical character sets | ||||
187 | 1 | 100ns | my $G0; | ||
188 | my $G1; | ||||
189 | |||||
190 | =head2 marc8_to_utf8() | ||||
191 | |||||
192 | Converts a MARC-8 encoded string to UTF-8. | ||||
193 | |||||
194 | my $utf8 = marc8_to_utf8($marc8); | ||||
195 | |||||
196 | If you'd like to ignore errors pass in a true value as the 2nd | ||||
197 | parameter or call MARC::Charset->ignore_errors() with a true | ||||
198 | value: | ||||
199 | |||||
200 | my $utf8 = marc8_to_utf8($marc8, 'ignore-errors'); | ||||
201 | |||||
202 | or | ||||
203 | |||||
204 | MARC::Charset->ignore_errors(1); | ||||
205 | my $utf8 = marc8_to_utf8($marc8); | ||||
206 | |||||
207 | =cut | ||||
208 | |||||
209 | |||||
210 | sub marc8_to_utf8 | ||||
211 | { | ||||
212 | my ($marc8, $ignore_errors) = @_; | ||||
213 | reset_charsets(); | ||||
214 | |||||
215 | $ignore_errors = $_ignore_errors if (!defined($ignore_errors)); | ||||
216 | |||||
217 | # holder for our utf8 | ||||
218 | my $utf8 = ''; | ||||
219 | |||||
220 | my $index = 0; | ||||
221 | my $length = length($marc8); | ||||
222 | my $combining = ''; | ||||
223 | CHAR_LOOP: while ($index < $length) | ||||
224 | { | ||||
225 | # whitespace, line feeds and carriage returns just get added on unmolested | ||||
226 | if (substr($marc8, $index, 1) =~ m/(\s+|\x0A+|\x0D+)/so) | ||||
227 | { | ||||
228 | $utf8 .= $1; | ||||
229 | $index += 1; | ||||
230 | next CHAR_LOOP; | ||||
231 | } | ||||
232 | |||||
233 | # look for any escape sequences | ||||
234 | my $new_index = _process_escape(\$marc8, $index, $length); | ||||
235 | if ($new_index > $index) | ||||
236 | { | ||||
237 | $index = $new_index; | ||||
238 | next CHAR_LOOP; | ||||
239 | } | ||||
240 | |||||
241 | my $found; | ||||
242 | CHARSET_LOOP: foreach my $charset ($G0, $G1) | ||||
243 | { | ||||
244 | |||||
245 | # cjk characters are a string of three chars | ||||
246 | my $char_size = $charset eq CJK ? 3 : 1; | ||||
247 | |||||
248 | # extract the next code point to examine | ||||
249 | my $chunk = substr($marc8, $index, $char_size); | ||||
250 | |||||
251 | my $code; | ||||
252 | if ($char_size == 1) { | ||||
253 | my $codepoint = ord($chunk); | ||||
254 | if ($codepoint >= 0x21 && $codepoint <= 0x7e) { | ||||
255 | # character is G0 | ||||
256 | $code = $table->lookup_by_marc8($G0, $chunk); | ||||
257 | } elsif ($codepoint >= 0xa1 && $codepoint <= 0xfe) { | ||||
258 | # character is G1, map it to G0 before atttempting lookup | ||||
259 | $code = $table->lookup_by_marc8($G1, chr($codepoint - 128)); | ||||
260 | } elsif ($codepoint >= 0x88 && $codepoint <= 0x8e) { | ||||
261 | # in the C1 range used by MARC8 | ||||
262 | $code = $table->lookup_by_marc8(EXTENDED_LATIN, $chunk); | ||||
263 | } elsif ($codepoint >= 0x1b && $codepoint <= 0x1f) { | ||||
264 | # in the C0 range used by MARC8 | ||||
265 | $code = $table->lookup_by_marc8(BASIC_LATIN, $chunk); | ||||
266 | } | ||||
267 | } else { | ||||
268 | # EACC doesn't need G0/G1 conversion | ||||
269 | $code = $table->lookup_by_marc8($charset, $chunk); | ||||
270 | } | ||||
271 | |||||
272 | # try the next character set if no mapping was found | ||||
273 | next CHARSET_LOOP if ! $code; | ||||
274 | $found = 1; | ||||
275 | |||||
276 | # gobble up all combining characters for appending later | ||||
277 | # this is necessary because combinging characters precede | ||||
278 | # the character they modify in MARC-8, whereas they follow | ||||
279 | # the character they modify in UTF-8. | ||||
280 | if ($code->is_combining()) | ||||
281 | { | ||||
282 | # If the current character is the right half of a MARC-8 | ||||
283 | # ligature or double tilde, we don't want to include | ||||
284 | # it in the UTF-8 output. For the explanation, see | ||||
285 | # http://lcweb2.loc.gov/diglib/codetables/45.html#Note1 | ||||
286 | # Note that if the MARC-8 string includes a right half | ||||
287 | # without the corresponding left half, the right half will | ||||
288 | # get dropped instead of being mapped to its UCS alternate. | ||||
289 | # That's OK since including only one half of a double diacritic | ||||
290 | # was presumably a mistake to begin with. | ||||
291 | unless (defined $code->marc_left_half()) | ||||
292 | { | ||||
293 | $combining .= $code->char_value(); | ||||
294 | } | ||||
295 | } | ||||
296 | else | ||||
297 | { | ||||
298 | $utf8 .= $code->char_value() . $combining; | ||||
299 | $combining = ''; | ||||
300 | } | ||||
301 | |||||
302 | $index += $char_size; | ||||
303 | next CHAR_LOOP; | ||||
304 | } | ||||
305 | |||||
306 | if (!$found) | ||||
307 | { | ||||
308 | warn(sprintf("no mapping found for [0x\%X] at position $index in $marc8 ". | ||||
309 | "g0=".MARC::Charset::Constants::charset_name($G0) . " " . | ||||
310 | "g1=".MARC::Charset::Constants::charset_name($G1), unpack('C',substr($marc8,$index,1)))); | ||||
311 | if (!$ignore_errors) | ||||
312 | { | ||||
313 | reset_charsets(); | ||||
314 | return; | ||||
315 | } | ||||
316 | if ($_assume) | ||||
317 | { | ||||
318 | reset_charsets(); | ||||
319 | return NFC(decode($_assume => $marc8)); | ||||
320 | } | ||||
321 | $index += 1; | ||||
322 | } | ||||
323 | |||||
324 | } | ||||
325 | |||||
326 | # return the utf8 | ||||
327 | reset_charsets(); | ||||
328 | utf8::upgrade($utf8); | ||||
329 | return $utf8; | ||||
330 | } | ||||
331 | |||||
- - | |||||
334 | =head2 utf8_to_marc8() | ||||
335 | |||||
336 | Will attempt to translate utf8 into marc8. | ||||
337 | |||||
338 | my $marc8 = utf8_to_marc8($utf8); | ||||
339 | |||||
340 | If you'd like to ignore errors, or characters that can't be | ||||
341 | converted to marc8 then pass in a true value as the second | ||||
342 | parameter: | ||||
343 | |||||
344 | my $marc8 = utf8_to_marc8($utf8, 'ignore-errors'); | ||||
345 | |||||
346 | or | ||||
347 | |||||
348 | MARC::Charset->ignore_errors(1); | ||||
349 | my $utf8 = marc8_to_utf8($marc8); | ||||
350 | |||||
351 | =cut | ||||
352 | |||||
353 | sub utf8_to_marc8 | ||||
354 | { | ||||
355 | my ($utf8, $ignore_errors) = @_; | ||||
356 | reset_charsets(); | ||||
357 | |||||
358 | $ignore_errors = $_ignore_errors if (!defined($ignore_errors)); | ||||
359 | |||||
360 | # decompose combined characters | ||||
361 | $utf8 = join('', | ||||
362 | map { exists $SPECIAL_DECOMPOSABLE{$_} ? $SPECIAL_DECOMPOSABLE{$_} : NFD($_) } | ||||
363 | split //, $utf8 | ||||
364 | ); | ||||
365 | |||||
366 | my $len = length($utf8); | ||||
367 | my $marc8 = ''; | ||||
368 | for (my $i=0; $i<$len; $i++) | ||||
369 | { | ||||
370 | my $slice = substr($utf8, $i, 1); | ||||
371 | |||||
372 | # spaces are copied from utf8 into marc8 | ||||
373 | if ($slice eq ' ') | ||||
374 | { | ||||
375 | $marc8 .= ' '; | ||||
376 | next; | ||||
377 | } | ||||
378 | |||||
379 | # try to find the code point in our mapping table | ||||
380 | my $code = $table->lookup_by_utf8($slice); | ||||
381 | |||||
382 | if (! $code) | ||||
383 | { | ||||
384 | warn("no mapping found at position $i in $utf8"); | ||||
385 | reset_charsets() and return unless $ignore_errors; | ||||
386 | } | ||||
387 | |||||
388 | # if it's a combining character move it around | ||||
389 | if ($code->is_combining()) | ||||
390 | { | ||||
391 | my $prev = chop($marc8); | ||||
392 | if ($code->marc_left_half()) | ||||
393 | { | ||||
394 | # don't add the MARC-8 right half character | ||||
395 | # if it was already inserted when the double | ||||
396 | # diacritic was converted from UTF-8 | ||||
397 | if ($code->marc_value() eq substr($marc8, -1, 1)) | ||||
398 | { | ||||
399 | $marc8 .= $prev; | ||||
400 | next; | ||||
401 | } | ||||
402 | } | ||||
403 | $marc8 .= $code->marc_value() . $prev; | ||||
404 | if ($code->marc_right_half()) | ||||
405 | { | ||||
406 | $marc8 .= chr(hex($code->marc_right_half())); | ||||
407 | } | ||||
408 | next; | ||||
409 | } | ||||
410 | |||||
411 | # look to see if we need to escape to a new G0 charset | ||||
412 | my $charset_value = $code->charset_value(); | ||||
413 | |||||
414 | if ($code->default_charset_group() eq 'G0' | ||||
415 | and $G0 ne $charset_value) | ||||
416 | { | ||||
417 | if ($G0 eq ASCII_DEFAULT and $charset_value eq BASIC_LATIN) | ||||
418 | { | ||||
419 | # don't bother escaping, they're functionally the same | ||||
420 | } | ||||
421 | else | ||||
422 | { | ||||
423 | $marc8 .= $code->get_escape(); | ||||
424 | $G0 = $charset_value; | ||||
425 | } | ||||
426 | } | ||||
427 | |||||
428 | # look to see if we need to escape to a new G1 charset | ||||
429 | elsif ($code->default_charset_group() eq 'G1' | ||||
430 | and $G1 ne $charset_value) | ||||
431 | { | ||||
432 | $marc8 .= $code->get_escape(); | ||||
433 | $G1 = $charset_value; | ||||
434 | } | ||||
435 | |||||
436 | $marc8 .= $code->marc_value(); | ||||
437 | } | ||||
438 | |||||
439 | # escape back to default G0 if necessary | ||||
440 | if ($G0 ne $DEFAULT_G0) | ||||
441 | { | ||||
442 | if ($DEFAULT_G0 eq ASCII_DEFAULT) { $marc8 .= ESCAPE . ASCII_DEFAULT; } | ||||
443 | elsif ($DEFAULT_G0 eq CJK) { $marc8 .= ESCAPE . MULTI_G0_A . CJK; } | ||||
444 | else { $marc8 .= ESCAPE . SINGLE_G0_A . $DEFAULT_G0; } | ||||
445 | } | ||||
446 | |||||
447 | # escape back to default G1 if necessary | ||||
448 | if ($G1 ne $DEFAULT_G1) | ||||
449 | { | ||||
450 | if ($DEFAULT_G1 eq CJK) { $marc8 .= ESCAPE . MULTI_G1_A . $DEFAULT_G1; } | ||||
451 | else { $marc8 .= ESCAPE . SINGLE_G1_A . $DEFAULT_G1; } | ||||
452 | } | ||||
453 | |||||
454 | return $marc8; | ||||
455 | } | ||||
456 | |||||
- - | |||||
459 | =head1 DEFAULT CHARACTER SETS | ||||
460 | |||||
461 | If you need to alter the default character sets you can set the | ||||
462 | $MARC::Charset::DEFAULT_G0 and $MARC::Charset::DEFAULT_G1 variables to the | ||||
463 | appropriate character set code: | ||||
464 | |||||
465 | use MARC::Charset::Constants qw(:all); | ||||
466 | $MARC::Charset::DEFAULT_G0 = BASIC_ARABIC; | ||||
467 | $MARC::Charset::DEFAULT_G1 = EXTENDED_ARABIC; | ||||
468 | |||||
469 | =head1 SEE ALSO | ||||
470 | |||||
471 | =over 4 | ||||
472 | |||||
473 | =item * L<MARC::Charset::Constant> | ||||
474 | |||||
475 | =item * L<MARC::Charset::Table> | ||||
476 | |||||
477 | =item * L<MARC::Charset::Code> | ||||
478 | |||||
479 | =item * L<MARC::Charset::Compiler> | ||||
480 | |||||
481 | =item * L<MARC::Record> | ||||
482 | |||||
483 | =item * L<MARC::XML> | ||||
484 | |||||
485 | =back | ||||
486 | |||||
487 | =head1 AUTHOR | ||||
488 | |||||
489 | Ed Summers (ehs@pobox.com) | ||||
490 | |||||
491 | =cut | ||||
492 | |||||
493 | |||||
494 | sub _process_escape | ||||
495 | { | ||||
496 | ## this stuff is kind of scary ... for an explanation of what is | ||||
497 | ## going on here check out the MARC-8 specs at LC. | ||||
498 | ## http://lcweb.loc.gov/marc/specifications/speccharmarc8.html | ||||
499 | my ($str_ref, $left, $right) = @_; | ||||
500 | |||||
501 | # first char needs to be an escape or else this isn't an escape sequence | ||||
502 | return $left unless substr($$str_ref, $left, 1) eq ESCAPE; | ||||
503 | |||||
504 | ## if we don't have at least one character after the escape | ||||
505 | ## then this can't be a character escape sequence | ||||
506 | return $left if ($left+1 >= $right); | ||||
507 | |||||
508 | ## pull off the first escape | ||||
509 | my $esc_char_1 = substr($$str_ref, $left+1, 1); | ||||
510 | |||||
511 | ## the first method of escaping to small character sets | ||||
512 | if ( $esc_char_1 eq GREEK_SYMBOLS | ||||
513 | or $esc_char_1 eq SUBSCRIPTS | ||||
514 | or $esc_char_1 eq SUPERSCRIPTS | ||||
515 | or $esc_char_1 eq ASCII_DEFAULT) | ||||
516 | { | ||||
517 | $G0 = $esc_char_1; | ||||
518 | return $left+2; | ||||
519 | } | ||||
520 | |||||
521 | ## the second more complicated method of escaping to bigger charsets | ||||
522 | return $left if $left+2 >= $right; | ||||
523 | |||||
524 | my $esc_char_2 = substr($$str_ref, $left+2, 1); | ||||
525 | my $esc_chars = $esc_char_1 . $esc_char_2; | ||||
526 | |||||
527 | if ($esc_char_1 eq SINGLE_G0_A | ||||
528 | or $esc_char_1 eq SINGLE_G0_B) | ||||
529 | { | ||||
530 | $G0 = $esc_char_2; | ||||
531 | return $left+3; | ||||
532 | } | ||||
533 | |||||
534 | elsif ($esc_char_1 eq SINGLE_G1_A | ||||
535 | or $esc_char_1 eq SINGLE_G1_B) | ||||
536 | { | ||||
537 | $G1 = $esc_char_2; | ||||
538 | return $left+3; | ||||
539 | } | ||||
540 | |||||
541 | elsif ( $esc_char_1 eq MULTI_G0_A ) { | ||||
542 | $G0 = $esc_char_2; | ||||
543 | return $left+3; | ||||
544 | } | ||||
545 | |||||
546 | elsif ($esc_chars eq MULTI_G0_B | ||||
547 | and ($left+3 < $right)) | ||||
548 | { | ||||
549 | $G0 = substr($$str_ref, $left+3, 1); | ||||
550 | return $left+4; | ||||
551 | } | ||||
552 | |||||
553 | elsif (($esc_chars eq MULTI_G1_A or $esc_chars eq MULTI_G1_B) | ||||
554 | and ($left + 3 < $right)) | ||||
555 | { | ||||
556 | $G1 = substr($$str_ref, $left+3, 1); | ||||
557 | return $left+4; | ||||
558 | } | ||||
559 | |||||
560 | # we should never get here | ||||
561 | warn("seem to have fallen through in _process_escape()"); | ||||
562 | return $left; | ||||
563 | } | ||||
564 | |||||
565 | sub reset_charsets | ||||
566 | { | ||||
567 | $G0 = $DEFAULT_G0; | ||||
568 | $G1 = $DEFAULT_G1; | ||||
569 | } | ||||
570 | |||||
571 | 1 | 10µs | 1; |