| Filename | /usr/share/perl5/MARC/Charset.pm |
| Statements | Executed 26 statements in 4.42ms |
| Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
|---|---|---|---|---|---|
| 1 | 1 | 1 | 1.59ms | 13.1ms | MARC::Charset::BEGIN@14 |
| 1 | 1 | 1 | 1.40ms | 2.31ms | MARC::Charset::BEGIN@11 |
| 1 | 1 | 1 | 1.04ms | 25.9ms | MARC::Charset::BEGIN@13 |
| 1 | 1 | 1 | 11µs | 22µs | MARC::Charset::BEGIN@3 |
| 1 | 1 | 1 | 8µs | 28µs | MARC::Charset::BEGIN@12 |
| 1 | 1 | 1 | 8µs | 170µs | MARC::Charset::BEGIN@15 |
| 1 | 1 | 1 | 7µs | 12µs | MARC::Charset::BEGIN@4 |
| 1 | 1 | 1 | 7µs | 64µs | MARC::Charset::BEGIN@8 |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::_process_escape |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::assume_encoding |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::assume_unicode |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::ignore_errors |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::marc8_to_utf8 |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::reset_charsets |
| 0 | 0 | 0 | 0s | 0s | MARC::Charset::utf8_to_marc8 |
| Line | State ments |
Time on line |
Calls | Time in subs |
Code |
|---|---|---|---|---|---|
| 1 | package MARC::Charset; | ||||
| 2 | |||||
| 3 | 2 | 22µs | 2 | 34µs | # spent 22µs (11+12) within MARC::Charset::BEGIN@3 which was called:
# once (11µs+12µs) by MARC::File::XML::BEGIN@11 at line 3 # spent 22µs making 1 call to MARC::Charset::BEGIN@3
# spent 12µs making 1 call to strict::import |
| 4 | 2 | 31µs | 2 | 16µs | # spent 12µs (7+5) within MARC::Charset::BEGIN@4 which was called:
# once (7µs+5µs) by MARC::File::XML::BEGIN@11 at line 4 # spent 12µs making 1 call to MARC::Charset::BEGIN@4
# spent 5µs making 1 call to warnings::import |
| 5 | |||||
| 6 | 1 | 700ns | our $VERSION = '1.35'; | ||
| 7 | |||||
| 8 | 2 | 33µs | 2 | 122µs | # spent 64µs (7+58) within MARC::Charset::BEGIN@8 which was called:
# once (7µs+58µs) by MARC::File::XML::BEGIN@11 at line 8 # spent 64µs making 1 call to MARC::Charset::BEGIN@8
# spent 58µs making 1 call to base::import |
| 9 | 1 | 1µs | our @EXPORT_OK = qw(marc8_to_utf8 utf8_to_marc8); | ||
| 10 | |||||
| 11 | 2 | 755µs | 2 | 2.34ms | # spent 2.31ms (1.40+902µs) within MARC::Charset::BEGIN@11 which was called:
# once (1.40ms+902µs) by MARC::File::XML::BEGIN@11 at line 11 # spent 2.31ms making 1 call to MARC::Charset::BEGIN@11
# spent 28µs making 1 call to Exporter::import |
| 12 | 2 | 24µs | 2 | 47µs | # spent 28µs (8+20) within MARC::Charset::BEGIN@12 which was called:
# once (8µs+20µs) by MARC::File::XML::BEGIN@11 at line 12 # spent 28µs making 1 call to MARC::Charset::BEGIN@12
# spent 20µs making 1 call to Exporter::import |
| 13 | 2 | 665µs | 2 | 26.0ms | # spent 25.9ms (1.04+24.9) within MARC::Charset::BEGIN@13 which was called:
# once (1.04ms+24.9ms) by MARC::File::XML::BEGIN@11 at line 13 # spent 25.9ms making 1 call to MARC::Charset::BEGIN@13
# spent 144µs making 1 call to charnames::import |
| 14 | 2 | 1.07ms | 1 | 13.1ms | # spent 13.1ms (1.59+11.6) within MARC::Charset::BEGIN@14 which was called:
# once (1.59ms+11.6ms) by MARC::File::XML::BEGIN@11 at line 14 # spent 13.1ms making 1 call to MARC::Charset::BEGIN@14 |
| 15 | 2 | 1.79ms | 2 | 332µs | # spent 170µs (8+162) within MARC::Charset::BEGIN@15 which was called:
# once (8µs+162µs) by MARC::File::XML::BEGIN@11 at line 15 # spent 170µs making 1 call to MARC::Charset::BEGIN@15
# spent 162µs making 1 call to Exporter::import |
| 16 | |||||
| 17 | =head1 NAME | ||||
| 18 | |||||
| 19 | MARC::Charset - convert MARC-8 encoded strings to UTF-8 | ||||
| 20 | |||||
| 21 | =head1 SYNOPSIS | ||||
| 22 | |||||
| 23 | # import the marc8_to_utf8 function | ||||
| 24 | use MARC::Charset 'marc8_to_utf8'; | ||||
| 25 | |||||
| 26 | # prepare STDOUT for utf8 | ||||
| 27 | binmode(STDOUT, 'utf8'); | ||||
| 28 | |||||
| 29 | # print out some marc8 as utf8 | ||||
| 30 | print marc8_to_utf8($marc8_string); | ||||
| 31 | |||||
| 32 | =head1 DESCRIPTION | ||||
| 33 | |||||
| 34 | MARC::Charset allows you to turn MARC-8 encoded strings into UTF-8 | ||||
| 35 | strings. MARC-8 is a single byte character encoding that predates unicode, and | ||||
| 36 | allows you to put non-Roman scripts in MARC bibliographic records. | ||||
| 37 | |||||
| 38 | http://www.loc.gov/marc/specifications/spechome.html | ||||
| 39 | |||||
| 40 | =head1 EXPORTS | ||||
| 41 | |||||
| 42 | =cut | ||||
| 43 | |||||
| 44 | # get the mapping table | ||||
| 45 | 1 | 2µs | 1 | 69µs | our $table = MARC::Charset::Table->new(); # spent 69µs making 1 call to MARC::Charset::Table::new |
| 46 | |||||
| 47 | # set default character sets | ||||
| 48 | # these are viewable at the package level | ||||
| 49 | # in case someone wants to set them | ||||
| 50 | 1 | 300ns | our $DEFAULT_G0 = ASCII_DEFAULT; | ||
| 51 | 1 | 100ns | our $DEFAULT_G1 = EXTENDED_LATIN; | ||
| 52 | |||||
| 53 | 1 | 23µs | our %SPECIAL_DECOMPOSABLE = ( | ||
| 54 | chr(0x01a0) => chr(0x01a0), # uppercase o-hook | ||||
| 55 | chr(0x01af) => chr(0x01af), # uppercase u-hook | ||||
| 56 | chr(0x01a1) => chr(0x01a1), # lowercase o-hook | ||||
| 57 | chr(0x01b0) => chr(0x01b0), # lowercase u-hook | ||||
| 58 | chr(0x1ef1) => chr(0x01b0) . chr(0x0323), # lowercase u-hook with dot below | ||||
| 59 | chr(0x1ee9) => chr(0x01b0) . chr(0x0301), # lowercase u-hook with acute | ||||
| 60 | # Arabic to not decompose | ||||
| 61 | chr(0x0622) => chr(0x0622), | ||||
| 62 | chr(0x0623) => chr(0x0623), | ||||
| 63 | chr(0x0624) => chr(0x0624), | ||||
| 64 | chr(0x0625) => chr(0x0625), | ||||
| 65 | chr(0x0626) => chr(0x0626), | ||||
| 66 | chr(0x0649) => chr(0x0649), | ||||
| 67 | chr(0x0671) => chr(0x0671), | ||||
| 68 | chr(0x06c0) => chr(0x06c0), | ||||
| 69 | chr(0x06D3) => chr(0x06D3), | ||||
| 70 | # Cyrillic to not decompose | ||||
| 71 | chr(0x0439) => chr(0x0439), | ||||
| 72 | chr(0x0419) => chr(0x0419), | ||||
| 73 | chr(0x0453) => chr(0x0453), | ||||
| 74 | chr(0x0451) => chr(0x0451), | ||||
| 75 | chr(0x0457) => chr(0x0457), | ||||
| 76 | chr(0x045C) => chr(0x045C), | ||||
| 77 | chr(0x045E) => chr(0x045E), | ||||
| 78 | chr(0x0403) => chr(0x0403), | ||||
| 79 | chr(0x0401) => chr(0x0401), | ||||
| 80 | chr(0x0407) => chr(0x0407), | ||||
| 81 | chr(0x040C) => chr(0x040C), | ||||
| 82 | chr(0x040E) => chr(0x040E), | ||||
| 83 | # Katakana to not decompose | ||||
| 84 | chr(0x309B) => chr(0x309B), | ||||
| 85 | chr(0x309C) => chr(0x309C), | ||||
| 86 | chr(0x30AC) => chr(0x30AC), | ||||
| 87 | chr(0x30AE) => chr(0x30AE), | ||||
| 88 | chr(0x30B0) => chr(0x30B0), | ||||
| 89 | chr(0x30B2) => chr(0x30B2), | ||||
| 90 | chr(0x30B4) => chr(0x30B4), | ||||
| 91 | chr(0x30B6) => chr(0x30B6), | ||||
| 92 | chr(0x30B8) => chr(0x30B8), | ||||
| 93 | chr(0x30BA) => chr(0x30BA), | ||||
| 94 | chr(0x30BC) => chr(0x30BC), | ||||
| 95 | chr(0x30BE) => chr(0x30BE), | ||||
| 96 | chr(0x30C0) => chr(0x30C0), | ||||
| 97 | chr(0x30C2) => chr(0x30C2), | ||||
| 98 | chr(0x30C5) => chr(0x30C5), | ||||
| 99 | chr(0x30C7) => chr(0x30C7), | ||||
| 100 | chr(0x30C9) => chr(0x30C9), | ||||
| 101 | chr(0x30D0) => chr(0x30D0), | ||||
| 102 | chr(0x30D1) => chr(0x30D1), | ||||
| 103 | chr(0x30D3) => chr(0x30D3), | ||||
| 104 | chr(0x30D4) => chr(0x30D4), | ||||
| 105 | chr(0x30D6) => chr(0x30D6), | ||||
| 106 | chr(0x30D7) => chr(0x30D7), | ||||
| 107 | chr(0x30D9) => chr(0x30D9), | ||||
| 108 | chr(0x30DA) => chr(0x30DA), | ||||
| 109 | chr(0x30DC) => chr(0x30DC), | ||||
| 110 | chr(0x30DD) => chr(0x30DD), | ||||
| 111 | chr(0x30F4) => chr(0x30F4), | ||||
| 112 | chr(0x30F7) => chr(0x30F7), | ||||
| 113 | chr(0x30F8) => chr(0x30F8), | ||||
| 114 | chr(0x30F9) => chr(0x30F9), | ||||
| 115 | chr(0x30FA) => chr(0x30FA), | ||||
| 116 | chr(0x30FE) => chr(0x30FE), | ||||
| 117 | chr(0x30FF) => chr(0x30FF), | ||||
| 118 | ); | ||||
| 119 | |||||
| 120 | =head2 ignore_errors() | ||||
| 121 | |||||
| 122 | Tells MARC::Charset whether or not to ignore all encoding errors, and | ||||
| 123 | returns the current setting. This is helpful if you have records that | ||||
| 124 | contain both MARC8 and UNICODE characters. | ||||
| 125 | |||||
| 126 | my $ignore = MARC::Charset->ignore_errors(); | ||||
| 127 | |||||
| 128 | MARC::Charset->ignore_errors(1); # ignore errors | ||||
| 129 | MARC::Charset->ignore_errors(0); # DO NOT ignore errors | ||||
| 130 | |||||
| 131 | =cut | ||||
| 132 | |||||
| 133 | |||||
| 134 | 1 | 200ns | our $_ignore_errors = 0; | ||
| 135 | sub ignore_errors { | ||||
| 136 | my ($self,$i) = @_; | ||||
| 137 | $_ignore_errors = $i if (defined($i)); | ||||
| 138 | return $_ignore_errors; | ||||
| 139 | } | ||||
| 140 | |||||
| 141 | |||||
| 142 | =head2 assume_unicode() | ||||
| 143 | |||||
| 144 | Tells MARC::Charset whether or not to assume UNICODE when an error is | ||||
| 145 | encountered in ignore_errors mode and returns the current setting. | ||||
| 146 | This is helpful if you have records that contain both MARC8 and UNICODE | ||||
| 147 | characters. | ||||
| 148 | |||||
| 149 | my $setting = MARC::Charset->assume_unicode(); | ||||
| 150 | |||||
| 151 | MARC::Charset->assume_unicode(1); # assume characters are unicode (utf-8) | ||||
| 152 | MARC::Charset->assume_unicode(0); # DO NOT assume characters are unicode | ||||
| 153 | |||||
| 154 | =cut | ||||
| 155 | |||||
| 156 | |||||
| 157 | 1 | 100ns | our $_assume = ''; | ||
| 158 | sub assume_unicode { | ||||
| 159 | my ($self,$i) = @_; | ||||
| 160 | $_assume = 'utf8' if (defined($i) and $i); | ||||
| 161 | return 1 if ($_assume eq 'utf8'); | ||||
| 162 | } | ||||
| 163 | |||||
| 164 | |||||
| 165 | =head2 assume_encoding() | ||||
| 166 | |||||
| 167 | Tells MARC::Charset whether or not to assume a specific encoding when an error | ||||
| 168 | is encountered in ignore_errors mode and returns the current setting. This | ||||
| 169 | is helpful if you have records that contain both MARC8 and other characters. | ||||
| 170 | |||||
| 171 | my $setting = MARC::Charset->assume_encoding(); | ||||
| 172 | |||||
| 173 | MARC::Charset->assume_encoding('cp850'); # assume characters are cp850 | ||||
| 174 | MARC::Charset->assume_encoding(''); # DO NOT assume any encoding | ||||
| 175 | |||||
| 176 | =cut | ||||
| 177 | |||||
| 178 | |||||
| 179 | sub assume_encoding { | ||||
| 180 | my ($self,$i) = @_; | ||||
| 181 | $_assume = $i if (defined($i)); | ||||
| 182 | return $_assume; | ||||
| 183 | } | ||||
| 184 | |||||
| 185 | |||||
| 186 | # place holders for working graphical character sets | ||||
| 187 | 1 | 100ns | my $G0; | ||
| 188 | my $G1; | ||||
| 189 | |||||
| 190 | =head2 marc8_to_utf8() | ||||
| 191 | |||||
| 192 | Converts a MARC-8 encoded string to UTF-8. | ||||
| 193 | |||||
| 194 | my $utf8 = marc8_to_utf8($marc8); | ||||
| 195 | |||||
| 196 | If you'd like to ignore errors pass in a true value as the 2nd | ||||
| 197 | parameter or call MARC::Charset->ignore_errors() with a true | ||||
| 198 | value: | ||||
| 199 | |||||
| 200 | my $utf8 = marc8_to_utf8($marc8, 'ignore-errors'); | ||||
| 201 | |||||
| 202 | or | ||||
| 203 | |||||
| 204 | MARC::Charset->ignore_errors(1); | ||||
| 205 | my $utf8 = marc8_to_utf8($marc8); | ||||
| 206 | |||||
| 207 | =cut | ||||
| 208 | |||||
| 209 | |||||
| 210 | sub marc8_to_utf8 | ||||
| 211 | { | ||||
| 212 | my ($marc8, $ignore_errors) = @_; | ||||
| 213 | reset_charsets(); | ||||
| 214 | |||||
| 215 | $ignore_errors = $_ignore_errors if (!defined($ignore_errors)); | ||||
| 216 | |||||
| 217 | # holder for our utf8 | ||||
| 218 | my $utf8 = ''; | ||||
| 219 | |||||
| 220 | my $index = 0; | ||||
| 221 | my $length = length($marc8); | ||||
| 222 | my $combining = ''; | ||||
| 223 | CHAR_LOOP: while ($index < $length) | ||||
| 224 | { | ||||
| 225 | # whitespace, line feeds and carriage returns just get added on unmolested | ||||
| 226 | if (substr($marc8, $index, 1) =~ m/(\s+|\x0A+|\x0D+)/so) | ||||
| 227 | { | ||||
| 228 | $utf8 .= $1; | ||||
| 229 | $index += 1; | ||||
| 230 | next CHAR_LOOP; | ||||
| 231 | } | ||||
| 232 | |||||
| 233 | # look for any escape sequences | ||||
| 234 | my $new_index = _process_escape(\$marc8, $index, $length); | ||||
| 235 | if ($new_index > $index) | ||||
| 236 | { | ||||
| 237 | $index = $new_index; | ||||
| 238 | next CHAR_LOOP; | ||||
| 239 | } | ||||
| 240 | |||||
| 241 | my $found; | ||||
| 242 | CHARSET_LOOP: foreach my $charset ($G0, $G1) | ||||
| 243 | { | ||||
| 244 | |||||
| 245 | # cjk characters are a string of three chars | ||||
| 246 | my $char_size = $charset eq CJK ? 3 : 1; | ||||
| 247 | |||||
| 248 | # extract the next code point to examine | ||||
| 249 | my $chunk = substr($marc8, $index, $char_size); | ||||
| 250 | |||||
| 251 | my $code; | ||||
| 252 | if ($char_size == 1) { | ||||
| 253 | my $codepoint = ord($chunk); | ||||
| 254 | if ($codepoint >= 0x21 && $codepoint <= 0x7e) { | ||||
| 255 | # character is G0 | ||||
| 256 | $code = $table->lookup_by_marc8($G0, $chunk); | ||||
| 257 | } elsif ($codepoint >= 0xa1 && $codepoint <= 0xfe) { | ||||
| 258 | # character is G1, map it to G0 before atttempting lookup | ||||
| 259 | $code = $table->lookup_by_marc8($G1, chr($codepoint - 128)); | ||||
| 260 | } elsif ($codepoint >= 0x88 && $codepoint <= 0x8e) { | ||||
| 261 | # in the C1 range used by MARC8 | ||||
| 262 | $code = $table->lookup_by_marc8(EXTENDED_LATIN, $chunk); | ||||
| 263 | } elsif ($codepoint >= 0x1b && $codepoint <= 0x1f) { | ||||
| 264 | # in the C0 range used by MARC8 | ||||
| 265 | $code = $table->lookup_by_marc8(BASIC_LATIN, $chunk); | ||||
| 266 | } | ||||
| 267 | } else { | ||||
| 268 | # EACC doesn't need G0/G1 conversion | ||||
| 269 | $code = $table->lookup_by_marc8($charset, $chunk); | ||||
| 270 | } | ||||
| 271 | |||||
| 272 | # try the next character set if no mapping was found | ||||
| 273 | next CHARSET_LOOP if ! $code; | ||||
| 274 | $found = 1; | ||||
| 275 | |||||
| 276 | # gobble up all combining characters for appending later | ||||
| 277 | # this is necessary because combinging characters precede | ||||
| 278 | # the character they modify in MARC-8, whereas they follow | ||||
| 279 | # the character they modify in UTF-8. | ||||
| 280 | if ($code->is_combining()) | ||||
| 281 | { | ||||
| 282 | # If the current character is the right half of a MARC-8 | ||||
| 283 | # ligature or double tilde, we don't want to include | ||||
| 284 | # it in the UTF-8 output. For the explanation, see | ||||
| 285 | # http://lcweb2.loc.gov/diglib/codetables/45.html#Note1 | ||||
| 286 | # Note that if the MARC-8 string includes a right half | ||||
| 287 | # without the corresponding left half, the right half will | ||||
| 288 | # get dropped instead of being mapped to its UCS alternate. | ||||
| 289 | # That's OK since including only one half of a double diacritic | ||||
| 290 | # was presumably a mistake to begin with. | ||||
| 291 | unless (defined $code->marc_left_half()) | ||||
| 292 | { | ||||
| 293 | $combining .= $code->char_value(); | ||||
| 294 | } | ||||
| 295 | } | ||||
| 296 | else | ||||
| 297 | { | ||||
| 298 | $utf8 .= $code->char_value() . $combining; | ||||
| 299 | $combining = ''; | ||||
| 300 | } | ||||
| 301 | |||||
| 302 | $index += $char_size; | ||||
| 303 | next CHAR_LOOP; | ||||
| 304 | } | ||||
| 305 | |||||
| 306 | if (!$found) | ||||
| 307 | { | ||||
| 308 | warn(sprintf("no mapping found for [0x\%X] at position $index in $marc8 ". | ||||
| 309 | "g0=".MARC::Charset::Constants::charset_name($G0) . " " . | ||||
| 310 | "g1=".MARC::Charset::Constants::charset_name($G1), unpack('C',substr($marc8,$index,1)))); | ||||
| 311 | if (!$ignore_errors) | ||||
| 312 | { | ||||
| 313 | reset_charsets(); | ||||
| 314 | return; | ||||
| 315 | } | ||||
| 316 | if ($_assume) | ||||
| 317 | { | ||||
| 318 | reset_charsets(); | ||||
| 319 | return NFC(decode($_assume => $marc8)); | ||||
| 320 | } | ||||
| 321 | $index += 1; | ||||
| 322 | } | ||||
| 323 | |||||
| 324 | } | ||||
| 325 | |||||
| 326 | # return the utf8 | ||||
| 327 | reset_charsets(); | ||||
| 328 | utf8::upgrade($utf8); | ||||
| 329 | return $utf8; | ||||
| 330 | } | ||||
| 331 | |||||
| - - | |||||
| 334 | =head2 utf8_to_marc8() | ||||
| 335 | |||||
| 336 | Will attempt to translate utf8 into marc8. | ||||
| 337 | |||||
| 338 | my $marc8 = utf8_to_marc8($utf8); | ||||
| 339 | |||||
| 340 | If you'd like to ignore errors, or characters that can't be | ||||
| 341 | converted to marc8 then pass in a true value as the second | ||||
| 342 | parameter: | ||||
| 343 | |||||
| 344 | my $marc8 = utf8_to_marc8($utf8, 'ignore-errors'); | ||||
| 345 | |||||
| 346 | or | ||||
| 347 | |||||
| 348 | MARC::Charset->ignore_errors(1); | ||||
| 349 | my $utf8 = marc8_to_utf8($marc8); | ||||
| 350 | |||||
| 351 | =cut | ||||
| 352 | |||||
| 353 | sub utf8_to_marc8 | ||||
| 354 | { | ||||
| 355 | my ($utf8, $ignore_errors) = @_; | ||||
| 356 | reset_charsets(); | ||||
| 357 | |||||
| 358 | $ignore_errors = $_ignore_errors if (!defined($ignore_errors)); | ||||
| 359 | |||||
| 360 | # decompose combined characters | ||||
| 361 | $utf8 = join('', | ||||
| 362 | map { exists $SPECIAL_DECOMPOSABLE{$_} ? $SPECIAL_DECOMPOSABLE{$_} : NFD($_) } | ||||
| 363 | split //, $utf8 | ||||
| 364 | ); | ||||
| 365 | |||||
| 366 | my $len = length($utf8); | ||||
| 367 | my $marc8 = ''; | ||||
| 368 | for (my $i=0; $i<$len; $i++) | ||||
| 369 | { | ||||
| 370 | my $slice = substr($utf8, $i, 1); | ||||
| 371 | |||||
| 372 | # spaces are copied from utf8 into marc8 | ||||
| 373 | if ($slice eq ' ') | ||||
| 374 | { | ||||
| 375 | $marc8 .= ' '; | ||||
| 376 | next; | ||||
| 377 | } | ||||
| 378 | |||||
| 379 | # try to find the code point in our mapping table | ||||
| 380 | my $code = $table->lookup_by_utf8($slice); | ||||
| 381 | |||||
| 382 | if (! $code) | ||||
| 383 | { | ||||
| 384 | warn("no mapping found at position $i in $utf8"); | ||||
| 385 | reset_charsets() and return unless $ignore_errors; | ||||
| 386 | } | ||||
| 387 | |||||
| 388 | # if it's a combining character move it around | ||||
| 389 | if ($code->is_combining()) | ||||
| 390 | { | ||||
| 391 | my $prev = chop($marc8); | ||||
| 392 | if ($code->marc_left_half()) | ||||
| 393 | { | ||||
| 394 | # don't add the MARC-8 right half character | ||||
| 395 | # if it was already inserted when the double | ||||
| 396 | # diacritic was converted from UTF-8 | ||||
| 397 | if ($code->marc_value() eq substr($marc8, -1, 1)) | ||||
| 398 | { | ||||
| 399 | $marc8 .= $prev; | ||||
| 400 | next; | ||||
| 401 | } | ||||
| 402 | } | ||||
| 403 | $marc8 .= $code->marc_value() . $prev; | ||||
| 404 | if ($code->marc_right_half()) | ||||
| 405 | { | ||||
| 406 | $marc8 .= chr(hex($code->marc_right_half())); | ||||
| 407 | } | ||||
| 408 | next; | ||||
| 409 | } | ||||
| 410 | |||||
| 411 | # look to see if we need to escape to a new G0 charset | ||||
| 412 | my $charset_value = $code->charset_value(); | ||||
| 413 | |||||
| 414 | if ($code->default_charset_group() eq 'G0' | ||||
| 415 | and $G0 ne $charset_value) | ||||
| 416 | { | ||||
| 417 | if ($G0 eq ASCII_DEFAULT and $charset_value eq BASIC_LATIN) | ||||
| 418 | { | ||||
| 419 | # don't bother escaping, they're functionally the same | ||||
| 420 | } | ||||
| 421 | else | ||||
| 422 | { | ||||
| 423 | $marc8 .= $code->get_escape(); | ||||
| 424 | $G0 = $charset_value; | ||||
| 425 | } | ||||
| 426 | } | ||||
| 427 | |||||
| 428 | # look to see if we need to escape to a new G1 charset | ||||
| 429 | elsif ($code->default_charset_group() eq 'G1' | ||||
| 430 | and $G1 ne $charset_value) | ||||
| 431 | { | ||||
| 432 | $marc8 .= $code->get_escape(); | ||||
| 433 | $G1 = $charset_value; | ||||
| 434 | } | ||||
| 435 | |||||
| 436 | $marc8 .= $code->marc_value(); | ||||
| 437 | } | ||||
| 438 | |||||
| 439 | # escape back to default G0 if necessary | ||||
| 440 | if ($G0 ne $DEFAULT_G0) | ||||
| 441 | { | ||||
| 442 | if ($DEFAULT_G0 eq ASCII_DEFAULT) { $marc8 .= ESCAPE . ASCII_DEFAULT; } | ||||
| 443 | elsif ($DEFAULT_G0 eq CJK) { $marc8 .= ESCAPE . MULTI_G0_A . CJK; } | ||||
| 444 | else { $marc8 .= ESCAPE . SINGLE_G0_A . $DEFAULT_G0; } | ||||
| 445 | } | ||||
| 446 | |||||
| 447 | # escape back to default G1 if necessary | ||||
| 448 | if ($G1 ne $DEFAULT_G1) | ||||
| 449 | { | ||||
| 450 | if ($DEFAULT_G1 eq CJK) { $marc8 .= ESCAPE . MULTI_G1_A . $DEFAULT_G1; } | ||||
| 451 | else { $marc8 .= ESCAPE . SINGLE_G1_A . $DEFAULT_G1; } | ||||
| 452 | } | ||||
| 453 | |||||
| 454 | return $marc8; | ||||
| 455 | } | ||||
| 456 | |||||
| - - | |||||
| 459 | =head1 DEFAULT CHARACTER SETS | ||||
| 460 | |||||
| 461 | If you need to alter the default character sets you can set the | ||||
| 462 | $MARC::Charset::DEFAULT_G0 and $MARC::Charset::DEFAULT_G1 variables to the | ||||
| 463 | appropriate character set code: | ||||
| 464 | |||||
| 465 | use MARC::Charset::Constants qw(:all); | ||||
| 466 | $MARC::Charset::DEFAULT_G0 = BASIC_ARABIC; | ||||
| 467 | $MARC::Charset::DEFAULT_G1 = EXTENDED_ARABIC; | ||||
| 468 | |||||
| 469 | =head1 SEE ALSO | ||||
| 470 | |||||
| 471 | =over 4 | ||||
| 472 | |||||
| 473 | =item * L<MARC::Charset::Constant> | ||||
| 474 | |||||
| 475 | =item * L<MARC::Charset::Table> | ||||
| 476 | |||||
| 477 | =item * L<MARC::Charset::Code> | ||||
| 478 | |||||
| 479 | =item * L<MARC::Charset::Compiler> | ||||
| 480 | |||||
| 481 | =item * L<MARC::Record> | ||||
| 482 | |||||
| 483 | =item * L<MARC::XML> | ||||
| 484 | |||||
| 485 | =back | ||||
| 486 | |||||
| 487 | =head1 AUTHOR | ||||
| 488 | |||||
| 489 | Ed Summers (ehs@pobox.com) | ||||
| 490 | |||||
| 491 | =cut | ||||
| 492 | |||||
| 493 | |||||
| 494 | sub _process_escape | ||||
| 495 | { | ||||
| 496 | ## this stuff is kind of scary ... for an explanation of what is | ||||
| 497 | ## going on here check out the MARC-8 specs at LC. | ||||
| 498 | ## http://lcweb.loc.gov/marc/specifications/speccharmarc8.html | ||||
| 499 | my ($str_ref, $left, $right) = @_; | ||||
| 500 | |||||
| 501 | # first char needs to be an escape or else this isn't an escape sequence | ||||
| 502 | return $left unless substr($$str_ref, $left, 1) eq ESCAPE; | ||||
| 503 | |||||
| 504 | ## if we don't have at least one character after the escape | ||||
| 505 | ## then this can't be a character escape sequence | ||||
| 506 | return $left if ($left+1 >= $right); | ||||
| 507 | |||||
| 508 | ## pull off the first escape | ||||
| 509 | my $esc_char_1 = substr($$str_ref, $left+1, 1); | ||||
| 510 | |||||
| 511 | ## the first method of escaping to small character sets | ||||
| 512 | if ( $esc_char_1 eq GREEK_SYMBOLS | ||||
| 513 | or $esc_char_1 eq SUBSCRIPTS | ||||
| 514 | or $esc_char_1 eq SUPERSCRIPTS | ||||
| 515 | or $esc_char_1 eq ASCII_DEFAULT) | ||||
| 516 | { | ||||
| 517 | $G0 = $esc_char_1; | ||||
| 518 | return $left+2; | ||||
| 519 | } | ||||
| 520 | |||||
| 521 | ## the second more complicated method of escaping to bigger charsets | ||||
| 522 | return $left if $left+2 >= $right; | ||||
| 523 | |||||
| 524 | my $esc_char_2 = substr($$str_ref, $left+2, 1); | ||||
| 525 | my $esc_chars = $esc_char_1 . $esc_char_2; | ||||
| 526 | |||||
| 527 | if ($esc_char_1 eq SINGLE_G0_A | ||||
| 528 | or $esc_char_1 eq SINGLE_G0_B) | ||||
| 529 | { | ||||
| 530 | $G0 = $esc_char_2; | ||||
| 531 | return $left+3; | ||||
| 532 | } | ||||
| 533 | |||||
| 534 | elsif ($esc_char_1 eq SINGLE_G1_A | ||||
| 535 | or $esc_char_1 eq SINGLE_G1_B) | ||||
| 536 | { | ||||
| 537 | $G1 = $esc_char_2; | ||||
| 538 | return $left+3; | ||||
| 539 | } | ||||
| 540 | |||||
| 541 | elsif ( $esc_char_1 eq MULTI_G0_A ) { | ||||
| 542 | $G0 = $esc_char_2; | ||||
| 543 | return $left+3; | ||||
| 544 | } | ||||
| 545 | |||||
| 546 | elsif ($esc_chars eq MULTI_G0_B | ||||
| 547 | and ($left+3 < $right)) | ||||
| 548 | { | ||||
| 549 | $G0 = substr($$str_ref, $left+3, 1); | ||||
| 550 | return $left+4; | ||||
| 551 | } | ||||
| 552 | |||||
| 553 | elsif (($esc_chars eq MULTI_G1_A or $esc_chars eq MULTI_G1_B) | ||||
| 554 | and ($left + 3 < $right)) | ||||
| 555 | { | ||||
| 556 | $G1 = substr($$str_ref, $left+3, 1); | ||||
| 557 | return $left+4; | ||||
| 558 | } | ||||
| 559 | |||||
| 560 | # we should never get here | ||||
| 561 | warn("seem to have fallen through in _process_escape()"); | ||||
| 562 | return $left; | ||||
| 563 | } | ||||
| 564 | |||||
| 565 | sub reset_charsets | ||||
| 566 | { | ||||
| 567 | $G0 = $DEFAULT_G0; | ||||
| 568 | $G1 = $DEFAULT_G1; | ||||
| 569 | } | ||||
| 570 | |||||
| 571 | 1 | 10µs | 1; |