Filename | /mnt/catalyst/koha/C4/Charset.pm |
Statements | Executed 355 statements in 4.46ms |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
1 | 1 | 1 | 2.32ms | 2.99ms | BEGIN@24 | C4::Charset::
1 | 1 | 1 | 434µs | 448µs | BEGIN@20 | C4::Charset::
1 | 1 | 1 | 10µs | 13µs | BEGIN@25 | C4::Charset::
1 | 1 | 1 | 9µs | 38µs | BEGIN@23 | C4::Charset::
1 | 1 | 1 | 9µs | 9µs | BEGIN@31 | C4::Charset::
1 | 1 | 1 | 9µs | 41µs | BEGIN@27 | C4::Charset::
1 | 1 | 1 | 8µs | 72µs | BEGIN@26 | C4::Charset::
1 | 1 | 1 | 7µs | 14µs | BEGIN@21 | C4::Charset::
1 | 1 | 1 | 6µs | 50µs | BEGIN@29 | C4::Charset::
0 | 0 | 0 | 0s | 0s | IsStringUTF8ish | C4::Charset::
0 | 0 | 0 | 0s | 0s | MarcToUTF8Record | C4::Charset::
0 | 0 | 0 | 0s | 0s | NormalizeString | C4::Charset::
0 | 0 | 0 | 0s | 0s | SetMarcUnicodeFlag | C4::Charset::
0 | 0 | 0 | 0s | 0s | SetUTF8Flag | C4::Charset::
0 | 0 | 0 | 0s | 0s | StripNonXmlChars | C4::Charset::
0 | 0 | 0 | 0s | 0s | __ANON__[:546] | C4::Charset::
0 | 0 | 0 | 0s | 0s | _default_marc21_charconv_to_utf8 | C4::Charset::
0 | 0 | 0 | 0s | 0s | _default_unimarc_charconv_to_utf8 | C4::Charset::
0 | 0 | 0 | 0s | 0s | _marc_iso5426_to_utf8 | C4::Charset::
0 | 0 | 0 | 0s | 0s | _marc_marc8_to_utf8 | C4::Charset::
0 | 0 | 0 | 0s | 0s | _marc_to_utf8_replacement_char | C4::Charset::
0 | 0 | 0 | 0s | 0s | _marc_to_utf8_via_text_iconv | C4::Charset::
0 | 0 | 0 | 0s | 0s | char_decode5426 | C4::Charset::
0 | 0 | 0 | 0s | 0s | nsb_clean | C4::Charset::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | package C4::Charset; | ||||
2 | |||||
3 | # Copyright (C) 2008 LibLime | ||||
4 | # | ||||
5 | # This file is part of Koha. | ||||
6 | # | ||||
7 | # Koha is free software; you can redistribute it and/or modify it under the | ||||
8 | # terms of the GNU General Public License as published by the Free Software | ||||
9 | # Foundation; either version 2 of the License, or (at your option) any later | ||||
10 | # version. | ||||
11 | # | ||||
12 | # Koha is distributed in the hope that it will be useful, but WITHOUT ANY | ||||
13 | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR | ||||
14 | # A PARTICULAR PURPOSE. See the GNU General Public License for more details. | ||||
15 | # | ||||
16 | # You should have received a copy of the GNU General Public License along | ||||
17 | # with Koha; if not, write to the Free Software Foundation, Inc., | ||||
18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
19 | |||||
20 | 2 | 28µs | 2 | 462µs | # spent 448µs (434+14) within C4::Charset::BEGIN@20 which was called:
# once (434µs+14µs) by C4::Biblio::BEGIN@37 at line 20 # spent 448µs making 1 call to C4::Charset::BEGIN@20
# spent 14µs making 1 call to strict::import |
21 | 2 | 24µs | 2 | 20µs | # spent 14µs (7+6) within C4::Charset::BEGIN@21 which was called:
# once (7µs+6µs) by C4::Biblio::BEGIN@37 at line 21 # spent 14µs making 1 call to C4::Charset::BEGIN@21
# spent 6µs making 1 call to warnings::import |
22 | |||||
23 | 2 | 25µs | 2 | 67µs | # spent 38µs (9+29) within C4::Charset::BEGIN@23 which was called:
# once (9µs+29µs) by C4::Biblio::BEGIN@37 at line 23 # spent 38µs making 1 call to C4::Charset::BEGIN@23
# spent 29µs making 1 call to Exporter::import |
24 | 2 | 679µs | 2 | 3.00ms | # spent 2.99ms (2.32+670µs) within C4::Charset::BEGIN@24 which was called:
# once (2.32ms+670µs) by C4::Biblio::BEGIN@37 at line 24 # spent 2.99ms making 1 call to C4::Charset::BEGIN@24
# spent 15µs making 1 call to Exporter::import |
25 | 2 | 22µs | 2 | 15µs | # spent 13µs (10+3) within C4::Charset::BEGIN@25 which was called:
# once (10µs+3µs) by C4::Biblio::BEGIN@37 at line 25 # spent 13µs making 1 call to C4::Charset::BEGIN@25
# spent 3µs making 1 call to C4::Context::import |
26 | 2 | 25µs | 2 | 136µs | # spent 72µs (8+64) within C4::Charset::BEGIN@26 which was called:
# once (8µs+64µs) by C4::Biblio::BEGIN@37 at line 26 # spent 72µs making 1 call to C4::Charset::BEGIN@26
# spent 64µs making 1 call to Exporter::import |
27 | 2 | 27µs | 2 | 73µs | # spent 41µs (9+32) within C4::Charset::BEGIN@27 which was called:
# once (9µs+32µs) by C4::Biblio::BEGIN@37 at line 27 # spent 41µs making 1 call to C4::Charset::BEGIN@27
# spent 32µs making 1 call to Exporter::import |
28 | |||||
29 | 2 | 47µs | 2 | 94µs | # spent 50µs (6+44) within C4::Charset::BEGIN@29 which was called:
# once (6µs+44µs) by C4::Biblio::BEGIN@37 at line 29 # spent 50µs making 1 call to C4::Charset::BEGIN@29
# spent 44µs making 1 call to vars::import |
30 | |||||
31 | # spent 9µs within C4::Charset::BEGIN@31 which was called:
# once (9µs+0s) by C4::Biblio::BEGIN@37 at line 45 | ||||
32 | # set the version for version checking | ||||
33 | 1 | 800ns | $VERSION = 3.07.00.049; | ||
34 | 1 | 400ns | require Exporter; | ||
35 | 1 | 5µs | @ISA = qw(Exporter); | ||
36 | 1 | 4µs | @EXPORT = qw( | ||
37 | NormalizeString | ||||
38 | IsStringUTF8ish | ||||
39 | MarcToUTF8Record | ||||
40 | SetUTF8Flag | ||||
41 | SetMarcUnicodeFlag | ||||
42 | StripNonXmlChars | ||||
43 | nsb_clean | ||||
44 | ); | ||||
45 | 1 | 3.47ms | 1 | 9µs | } # spent 9µs making 1 call to C4::Charset::BEGIN@31 |
46 | |||||
47 | =head1 NAME | ||||
48 | |||||
49 | C4::Charset - utilities for handling character set conversions. | ||||
50 | |||||
51 | =head1 SYNOPSIS | ||||
52 | |||||
53 | use C4::Charset; | ||||
54 | |||||
55 | =head1 DESCRIPTION | ||||
56 | |||||
57 | This module contains routines for dealing with character set | ||||
58 | conversions, particularly for MARC records. | ||||
59 | |||||
60 | A variety of character encodings are in use by various MARC | ||||
61 | standards, and even more character encodings are used by | ||||
62 | non-standard MARC records. The various MARC formats generally | ||||
63 | do not do a good job of advertising a given record's character | ||||
64 | encoding, and even when a record does advertise its encoding, | ||||
65 | e.g., via the Leader/09, experience has shown that one cannot | ||||
66 | trust it. | ||||
67 | |||||
68 | Ultimately, all MARC records are stored in Koha in UTF-8 and | ||||
69 | must be converted from whatever the source character encoding is. | ||||
70 | The goal of this module is to ensure that these conversions | ||||
71 | take place accurately. When a character conversion cannot take | ||||
72 | place, or at least not accurately, the module was provide | ||||
73 | enough information to allow user-facing code to inform the user | ||||
74 | on how to deal with the situation. | ||||
75 | |||||
76 | =cut | ||||
77 | |||||
78 | =head1 FUNCTIONS | ||||
79 | |||||
80 | =head2 IsStringUTF8ish | ||||
81 | |||||
82 | my $is_utf8 = IsStringUTF8ish($str); | ||||
83 | |||||
84 | Determines if C<$str> is valid UTF-8. This can mean | ||||
85 | one of two things: | ||||
86 | |||||
87 | =over | ||||
88 | |||||
89 | =item * | ||||
90 | |||||
91 | The Perl UTF-8 flag is set and the string contains valid UTF-8. | ||||
92 | |||||
93 | =item * | ||||
94 | |||||
95 | The Perl UTF-8 flag is B<not> set, but the octets contain | ||||
96 | valid UTF-8. | ||||
97 | |||||
98 | =back | ||||
99 | |||||
100 | The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8> | ||||
101 | because in one could be presented with a MARC blob that is | ||||
102 | not actually in UTF-8 but whose sequence of octets appears to be | ||||
103 | valid UTF-8. The rest of the MARC character conversion functions | ||||
104 | will assume that this situation occur does not very often. | ||||
105 | |||||
106 | =cut | ||||
107 | |||||
108 | sub IsStringUTF8ish { | ||||
109 | my $str = shift; | ||||
110 | |||||
111 | return 1 if utf8::is_utf8($str); | ||||
112 | return utf8::decode($str); | ||||
113 | } | ||||
114 | |||||
115 | =head2 SetUTF8Flag | ||||
116 | |||||
117 | my $marc_record = SetUTF8Flag($marc_record, $nfd); | ||||
118 | |||||
119 | This function sets the PERL UTF8 flag for data. | ||||
120 | It is required when using new_from_usmarc | ||||
121 | since MARC::File::USMARC does not handle PERL UTF8 setting. | ||||
122 | When editing unicode marc records fields and subfields, you | ||||
123 | would end up in double encoding without using this function. | ||||
124 | |||||
125 | If $nfd is set, string normalization will use NFD instead of NFC | ||||
126 | |||||
127 | FIXME | ||||
128 | In my opinion, this function belongs to MARC::Record and not | ||||
129 | to this package. | ||||
130 | But since it handles charset, and MARC::Record, it finds its way in that package | ||||
131 | |||||
132 | =cut | ||||
133 | |||||
134 | sub SetUTF8Flag{ | ||||
135 | my ($record, $nfd)=@_; | ||||
136 | return unless ($record && $record->fields()); | ||||
137 | foreach my $field ($record->fields()){ | ||||
138 | if ($field->tag()>=10){ | ||||
139 | my @subfields; | ||||
140 | foreach my $subfield ($field->subfields()){ | ||||
141 | push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd)); | ||||
142 | } | ||||
143 | eval { | ||||
144 | my $newfield=MARC::Field->new( | ||||
145 | $field->tag(), | ||||
146 | $field->indicator(1), | ||||
147 | $field->indicator(2), | ||||
148 | @subfields | ||||
149 | ); | ||||
150 | $field->replace_with($newfield); | ||||
151 | }; | ||||
152 | warn "ERROR occurred in SetUTF8Flag $@" if $@; | ||||
153 | } | ||||
154 | } | ||||
155 | } | ||||
156 | |||||
157 | =head2 NormalizeString | ||||
158 | |||||
159 | my $normalized_string=NormalizeString($string,$nfd,$transform); | ||||
160 | |||||
161 | Given a string | ||||
162 | nfd : If you want to set NFD and not NFC | ||||
163 | transform : If you expect all the signs to be removed | ||||
164 | |||||
165 | Sets the PERL UTF8 Flag on your initial data if need be | ||||
166 | and applies cleaning if required | ||||
167 | |||||
168 | Returns a utf8 NFC normalized string | ||||
169 | |||||
170 | Sample code : | ||||
171 | my $string=NormalizeString ("l'ornithoptère"); | ||||
172 | #results into ornithoptère in NFC form and sets UTF8 Flag | ||||
173 | |||||
174 | =cut | ||||
175 | |||||
176 | |||||
177 | sub NormalizeString{ | ||||
178 | my ($string,$nfd,$transform)=@_; | ||||
179 | utf8::decode($string) unless (utf8::is_utf8($string)); | ||||
180 | if ($nfd){ | ||||
181 | $string= NFD($string); | ||||
182 | } | ||||
183 | else { | ||||
184 | $string=NFC($string); | ||||
185 | } | ||||
186 | if ($transform){ | ||||
187 | $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g; | ||||
188 | #removing one letter words "d'" "l'" was changed into "d " "l " | ||||
189 | $string=~s/\b\S\b//g; | ||||
190 | $string=~s/\s+$//g; | ||||
191 | } | ||||
192 | return $string; | ||||
193 | } | ||||
194 | |||||
195 | =head2 MarcToUTF8Record | ||||
196 | |||||
197 | ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, | ||||
198 | $marc_flavour, [, $source_encoding]); | ||||
199 | |||||
200 | Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an | ||||
201 | optional source encoding, return a C<MARC::Record> that is | ||||
202 | converted to UTF-8. | ||||
203 | |||||
204 | The returned C<$marc_record> is guaranteed to be in valid UTF-8, but | ||||
205 | is not guaranteed to have been converted correctly. Specifically, | ||||
206 | if C<$converted_from> is 'failed', the MARC record returned failed | ||||
207 | character conversion and had each of its non-ASCII octets changed | ||||
208 | to the Unicode replacement character. | ||||
209 | |||||
210 | If the source encoding was not specified, this routine will | ||||
211 | try to guess it; the character encoding used for a successful | ||||
212 | conversion is returned in C<$converted_from>. | ||||
213 | |||||
214 | =cut | ||||
215 | |||||
216 | sub MarcToUTF8Record { | ||||
217 | my $marc = shift; | ||||
218 | my $marc_flavour = shift; | ||||
219 | my $source_encoding = shift; | ||||
220 | my $marc_record; | ||||
221 | my $marc_blob_is_utf8 = 0; | ||||
222 | if (ref($marc) eq 'MARC::Record') { | ||||
223 | my $marc_blob = $marc->as_usmarc(); | ||||
224 | $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob); | ||||
225 | $marc_record = $marc; | ||||
226 | } else { | ||||
227 | # dealing with a MARC blob | ||||
228 | |||||
229 | # remove any ersatz whitespace from the beginning and | ||||
230 | # end of the MARC blob -- these can creep into MARC | ||||
231 | # files produced by several sources -- caller really | ||||
232 | # should be doing this, however | ||||
233 | $marc =~ s/^\s+//; | ||||
234 | $marc =~ s/\s+$//; | ||||
235 | $marc_blob_is_utf8 = IsStringUTF8ish($marc); | ||||
236 | eval { | ||||
237 | $marc_record = MARC::Record->new_from_usmarc($marc); | ||||
238 | }; | ||||
239 | if ($@) { | ||||
240 | # if we fail the first time, one likely problem | ||||
241 | # is that we have a MARC21 record that says that it's | ||||
242 | # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters. | ||||
243 | # We'll try parsing it again. | ||||
244 | substr($marc, 9, 1) = ' '; | ||||
245 | eval { | ||||
246 | $marc_record = MARC::Record->new_from_usmarc($marc); | ||||
247 | }; | ||||
248 | if ($@) { | ||||
249 | # it's hopeless; return an empty MARC::Record | ||||
250 | return MARC::Record->new(), 'failed', ['could not parse MARC blob']; | ||||
251 | } | ||||
252 | } | ||||
253 | } | ||||
254 | |||||
255 | # If we do not know the source encoding, try some guesses | ||||
256 | # as follows: | ||||
257 | # 1. Record is UTF-8 already. | ||||
258 | # 2. If MARC flavor is MARC21 or NORMARC, then | ||||
259 | # a. record is MARC-8 | ||||
260 | # b. record is ISO-8859-1 | ||||
261 | # 3. If MARC flavor is UNIMARC, then | ||||
262 | if (not defined $source_encoding) { | ||||
263 | if ($marc_blob_is_utf8) { | ||||
264 | # note that for MARC21/NORMARC we are not bothering to check | ||||
265 | # if the Leader/09 is set to 'a' or not -- because | ||||
266 | # of problems with various ILSs (including Koha in the | ||||
267 | # past, alas), this just is not trustworthy. | ||||
268 | SetMarcUnicodeFlag($marc_record, $marc_flavour); | ||||
269 | return $marc_record, 'UTF-8', []; | ||||
270 | } else { | ||||
271 | if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') { | ||||
272 | return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour); | ||||
273 | } elsif ($marc_flavour =~/UNIMARC/) { | ||||
274 | return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour); | ||||
275 | } else { | ||||
276 | return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour); | ||||
277 | } | ||||
278 | } | ||||
279 | } else { | ||||
280 | # caller knows the character encoding | ||||
281 | my $original_marc_record = $marc_record->clone(); | ||||
282 | my @errors; | ||||
283 | if ($source_encoding =~ /utf-?8/i) { | ||||
284 | if ($marc_blob_is_utf8) { | ||||
285 | SetMarcUnicodeFlag($marc_record, $marc_flavour); | ||||
286 | return $marc_record, 'UTF-8', []; | ||||
287 | } else { | ||||
288 | push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8'; | ||||
289 | } | ||||
290 | } elsif ($source_encoding =~ /marc-?8/i) { | ||||
291 | @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour); | ||||
292 | } elsif ($source_encoding =~ /5426/) { | ||||
293 | @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour); | ||||
294 | } else { | ||||
295 | # assume any other character encoding is for Text::Iconv | ||||
296 | @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding); | ||||
297 | } | ||||
298 | |||||
299 | if (@errors) { | ||||
300 | _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour); | ||||
301 | return $original_marc_record, 'failed', \@errors; | ||||
302 | } else { | ||||
303 | return $marc_record, $source_encoding, []; | ||||
304 | } | ||||
305 | } | ||||
306 | |||||
307 | } | ||||
308 | |||||
309 | =head2 SetMarcUnicodeFlag | ||||
310 | |||||
311 | SetMarcUnicodeFlag($marc_record, $marc_flavour); | ||||
312 | |||||
313 | Set both the internal MARC::Record encoding flag | ||||
314 | and the appropriate Leader/09 (MARC21) or | ||||
315 | 100/26-29 (UNIMARC) to indicate that the record | ||||
316 | is in UTF-8. Note that this does B<not> do | ||||
317 | any actual character conversion. | ||||
318 | |||||
319 | =cut | ||||
320 | |||||
321 | sub SetMarcUnicodeFlag { | ||||
322 | my $marc_record = shift; | ||||
323 | my $marc_flavour = shift; # || C4::Context->preference("marcflavour"); | ||||
324 | |||||
325 | $marc_record->encoding('UTF-8'); | ||||
326 | if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') { | ||||
327 | my $leader = $marc_record->leader(); | ||||
328 | substr($leader, 9, 1) = 'a'; | ||||
329 | $marc_record->leader($leader); | ||||
330 | } elsif ($marc_flavour =~/UNIMARC/) { | ||||
331 | my $defaultlanguage = C4::Context->preference("UNIMARCField100Language"); | ||||
332 | $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3); | ||||
333 | my $string; | ||||
334 | my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25)); | ||||
335 | $string=$marc_record->subfield( 100, "a" ); | ||||
336 | if (defined $string && length($string)==$subflength) { | ||||
337 | $string = substr $string, 0,$subflength if (length($string)>$subflength); | ||||
338 | } | ||||
339 | else { | ||||
340 | $string = POSIX::strftime( "%Y%m%d", localtime ); | ||||
341 | $string =~ s/\-//g; | ||||
342 | $string = sprintf( "%-*s", $subflength, $string ); | ||||
343 | substr ( $string, ($encodingposition - 3), 3, $defaultlanguage); | ||||
344 | } | ||||
345 | substr( $string, $encodingposition, 3, "y50" ); | ||||
346 | if ( $marc_record->subfield( 100, "a" ) ) { | ||||
347 | $marc_record->field('100')->update(a=>$string); | ||||
348 | } | ||||
349 | else { | ||||
350 | $marc_record->insert_grouped_field( | ||||
351 | MARC::Field->new( 100, '', '', "a" => $string ) ); | ||||
352 | } | ||||
353 | $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 ); | ||||
354 | } else { | ||||
355 | warn "Unrecognized marcflavour: $marc_flavour"; | ||||
356 | } | ||||
357 | } | ||||
358 | |||||
359 | =head2 StripNonXmlChars | ||||
360 | |||||
361 | my $new_str = StripNonXmlChars($old_str); | ||||
362 | |||||
363 | Given a string, return a copy with the | ||||
364 | characters that are illegal in XML | ||||
365 | removed. | ||||
366 | |||||
367 | This function exists to work around a problem | ||||
368 | that can occur with badly-encoded MARC records. | ||||
369 | Specifically, if a UTF-8 MARC record also | ||||
370 | has excape (\x1b) characters, MARC::File::XML | ||||
371 | will let the escape characters pass through | ||||
372 | when as_xml() or as_xml_record() is called. The | ||||
373 | problem is that the escape character is not | ||||
374 | legal in well-formed XML documents, so when | ||||
375 | MARC::File::XML attempts to parse such a record, | ||||
376 | the XML parser will fail. | ||||
377 | |||||
378 | Stripping such characters will allow a | ||||
379 | MARC::Record->new_from_xml() | ||||
380 | to work, at the possible risk of some data loss. | ||||
381 | |||||
382 | =cut | ||||
383 | |||||
384 | sub StripNonXmlChars { | ||||
385 | my $str = shift; | ||||
386 | if (!defined($str) || $str eq ""){ | ||||
387 | return ""; | ||||
388 | } | ||||
389 | $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g; | ||||
390 | return $str; | ||||
391 | } | ||||
392 | |||||
- - | |||||
395 | =head2 nsb_clean | ||||
396 | |||||
397 | =over 4 | ||||
398 | |||||
399 | nsb_clean($string); | ||||
400 | |||||
401 | =back | ||||
402 | |||||
403 | Removes Non Sorting Block characters | ||||
404 | |||||
405 | =cut | ||||
406 | sub nsb_clean { | ||||
407 | my $NSB = '\x88' ; # NSB : begin Non Sorting Block | ||||
408 | my $NSE = '\x89' ; # NSE : Non Sorting Block end | ||||
409 | my $NSB2 = '\x98' ; # NSB : begin Non Sorting Block | ||||
410 | my $NSE2 = '\x9C' ; # NSE : Non Sorting Block end | ||||
411 | my $C2 = '\xC2' ; # What is this char ? It is sometimes left by the regexp after removing NSB / NSE | ||||
412 | |||||
413 | # handles non sorting blocks | ||||
414 | my ($string) = @_ ; | ||||
415 | $_ = $string ; | ||||
416 | s/$NSB//g ; | ||||
417 | s/$NSE//g ; | ||||
418 | s/$NSB2//g ; | ||||
419 | s/$NSE2//g ; | ||||
420 | s/$C2//g ; | ||||
421 | $string = $_ ; | ||||
422 | |||||
423 | return($string) ; | ||||
424 | } | ||||
425 | |||||
426 | |||||
427 | =head1 INTERNAL FUNCTIONS | ||||
428 | |||||
429 | =head2 _default_marc21_charconv_to_utf8 | ||||
430 | |||||
431 | my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record); | ||||
432 | |||||
433 | Converts a C<MARC::Record> of unknown character set to UTF-8, | ||||
434 | first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1 | ||||
435 | to UTF-8, then a default conversion that replaces each non-ASCII | ||||
436 | character with the replacement character. | ||||
437 | |||||
438 | The C<$guessed_charset> return value contains the character set | ||||
439 | that resulted in a conversion to valid UTF-8; note that | ||||
440 | if the MARC-8 and ISO-8859-1 conversions failed, the value of | ||||
441 | this is 'failed'. | ||||
442 | |||||
443 | =cut | ||||
444 | |||||
445 | sub _default_marc21_charconv_to_utf8 { | ||||
446 | my $marc_record = shift; | ||||
447 | my $marc_flavour = shift; | ||||
448 | |||||
449 | my $trial_marc8 = $marc_record->clone(); | ||||
450 | my @all_errors = (); | ||||
451 | my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour); | ||||
452 | unless (@errors) { | ||||
453 | return $trial_marc8, 'MARC-8', []; | ||||
454 | } | ||||
455 | push @all_errors, @errors; | ||||
456 | |||||
457 | my $trial_8859_1 = $marc_record->clone(); | ||||
458 | @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1'); | ||||
459 | unless (@errors) { | ||||
460 | return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors | ||||
461 | # instead if we wanted to report details | ||||
462 | # of the failed attempt at MARC-8 => UTF-8 | ||||
463 | } | ||||
464 | push @all_errors, @errors; | ||||
465 | |||||
466 | my $default_converted = $marc_record->clone(); | ||||
467 | _marc_to_utf8_replacement_char($default_converted, $marc_flavour); | ||||
468 | return $default_converted, 'failed', \@all_errors; | ||||
469 | } | ||||
470 | |||||
471 | =head2 _default_unimarc_charconv_to_utf8 | ||||
472 | |||||
473 | my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record); | ||||
474 | |||||
475 | Converts a C<MARC::Record> of unknown character set to UTF-8, | ||||
476 | first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1 | ||||
477 | to UTF-8, then a default conversion that replaces each non-ASCII | ||||
478 | character with the replacement character. | ||||
479 | |||||
480 | The C<$guessed_charset> return value contains the character set | ||||
481 | that resulted in a conversion to valid UTF-8; note that | ||||
482 | if the MARC-8 and ISO-8859-1 conversions failed, the value of | ||||
483 | this is 'failed'. | ||||
484 | |||||
485 | =cut | ||||
486 | |||||
487 | sub _default_unimarc_charconv_to_utf8 { | ||||
488 | my $marc_record = shift; | ||||
489 | my $marc_flavour = shift; | ||||
490 | |||||
491 | my $trial_marc8 = $marc_record->clone(); | ||||
492 | my @all_errors = (); | ||||
493 | my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour); | ||||
494 | unless (@errors) { | ||||
495 | return $trial_marc8, 'iso-5426'; | ||||
496 | } | ||||
497 | push @all_errors, @errors; | ||||
498 | |||||
499 | my $trial_8859_1 = $marc_record->clone(); | ||||
500 | @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1'); | ||||
501 | unless (@errors) { | ||||
502 | return $trial_8859_1, 'iso-8859-1'; | ||||
503 | } | ||||
504 | push @all_errors, @errors; | ||||
505 | |||||
506 | my $default_converted = $marc_record->clone(); | ||||
507 | _marc_to_utf8_replacement_char($default_converted, $marc_flavour); | ||||
508 | return $default_converted, 'failed', \@all_errors; | ||||
509 | } | ||||
510 | |||||
511 | =head2 _marc_marc8_to_utf8 | ||||
512 | |||||
513 | my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding); | ||||
514 | |||||
515 | Convert a C<MARC::Record> to UTF-8 in-place from MARC-8. | ||||
516 | If the conversion fails for some reason, an | ||||
517 | appropriate messages will be placed in the returned | ||||
518 | C<@errors> array. | ||||
519 | |||||
520 | =cut | ||||
521 | |||||
522 | sub _marc_marc8_to_utf8 { | ||||
523 | my $marc_record = shift; | ||||
524 | my $marc_flavour = shift; | ||||
525 | |||||
526 | my $prev_ignore = MARC::Charset->ignore_errors(); | ||||
527 | MARC::Charset->ignore_errors(1); | ||||
528 | |||||
529 | # trap warnings raised by MARC::Charset | ||||
530 | my @errors = (); | ||||
531 | local $SIG{__WARN__} = sub { | ||||
532 | my $msg = $_[0]; | ||||
533 | if ($msg =~ /MARC.Charset/) { | ||||
534 | # FIXME - purpose of this regexp is to strip out the | ||||
535 | # line reference to MARC/Charset.pm, but as it | ||||
536 | # exists probably won't work quite on Windows -- | ||||
537 | # some sort of minimal-bunch back-tracking RE | ||||
538 | # would be helpful here | ||||
539 | $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//; | ||||
540 | push @errors, $msg; | ||||
541 | } else { | ||||
542 | # if warning doesn't come from MARC::Charset, just | ||||
543 | # pass it on | ||||
544 | warn $msg; | ||||
545 | } | ||||
546 | }; | ||||
547 | |||||
548 | foreach my $field ($marc_record->fields()) { | ||||
549 | if ($field->is_control_field()) { | ||||
550 | ; # do nothing -- control fields should not contain non-ASCII characters | ||||
551 | } else { | ||||
552 | my @converted_subfields; | ||||
553 | foreach my $subfield ($field->subfields()) { | ||||
554 | my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]); | ||||
555 | unless (IsStringUTF8ish($utf8sf)) { | ||||
556 | # Because of a bug in MARC::Charset 0.98, if the string | ||||
557 | # has (a) one or more diacritics that (b) are only in character positions | ||||
558 | # 128 to 255 inclusive, the resulting converted string is not in | ||||
559 | # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that | ||||
560 | # occurs, upgrade the string in place. Moral of the story seems to be | ||||
561 | # that pack("U", ...) is better than chr(...) if you need to guarantee | ||||
562 | # that the resulting string is UTF-8. | ||||
563 | utf8::upgrade($utf8sf); | ||||
564 | } | ||||
565 | push @converted_subfields, $subfield->[0], $utf8sf; | ||||
566 | } | ||||
567 | |||||
568 | $field->replace_with(MARC::Field->new( | ||||
569 | $field->tag(), $field->indicator(1), $field->indicator(2), | ||||
570 | @converted_subfields) | ||||
571 | ); | ||||
572 | } | ||||
573 | } | ||||
574 | |||||
575 | MARC::Charset->ignore_errors($prev_ignore); | ||||
576 | |||||
577 | SetMarcUnicodeFlag($marc_record, $marc_flavour); | ||||
578 | |||||
579 | return @errors; | ||||
580 | } | ||||
581 | |||||
582 | =head2 _marc_iso5426_to_utf8 | ||||
583 | |||||
584 | my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding); | ||||
585 | |||||
586 | Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426. | ||||
587 | If the conversion fails for some reason, an | ||||
588 | appropriate messages will be placed in the returned | ||||
589 | C<@errors> array. | ||||
590 | |||||
591 | FIXME - is ISO-5426 equivalent enough to MARC-8 | ||||
592 | that C<MARC::Charset> can be used instead? | ||||
593 | |||||
594 | =cut | ||||
595 | |||||
596 | sub _marc_iso5426_to_utf8 { | ||||
597 | my $marc_record = shift; | ||||
598 | my $marc_flavour = shift; | ||||
599 | |||||
600 | my @errors = (); | ||||
601 | |||||
602 | foreach my $field ($marc_record->fields()) { | ||||
603 | if ($field->is_control_field()) { | ||||
604 | ; # do nothing -- control fields should not contain non-ASCII characters | ||||
605 | } else { | ||||
606 | my @converted_subfields; | ||||
607 | foreach my $subfield ($field->subfields()) { | ||||
608 | my $utf8sf = char_decode5426($subfield->[1]); | ||||
609 | push @converted_subfields, $subfield->[0], $utf8sf; | ||||
610 | } | ||||
611 | |||||
612 | $field->replace_with(MARC::Field->new( | ||||
613 | $field->tag(), $field->indicator(1), $field->indicator(2), | ||||
614 | @converted_subfields) | ||||
615 | ); | ||||
616 | } | ||||
617 | } | ||||
618 | |||||
619 | SetMarcUnicodeFlag($marc_record, $marc_flavour); | ||||
620 | |||||
621 | return @errors; | ||||
622 | } | ||||
623 | |||||
624 | =head2 _marc_to_utf8_via_text_iconv | ||||
625 | |||||
626 | my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding); | ||||
627 | |||||
628 | Convert a C<MARC::Record> to UTF-8 in-place using the | ||||
629 | C<Text::Iconv> CPAN module. Any source encoding accepted | ||||
630 | by the user's iconv installation should work. If | ||||
631 | the source encoding is not recognized on the user's | ||||
632 | server or the conversion fails for some reason, | ||||
633 | appropriate messages will be placed in the returned | ||||
634 | C<@errors> array. | ||||
635 | |||||
636 | =cut | ||||
637 | |||||
638 | sub _marc_to_utf8_via_text_iconv { | ||||
639 | my $marc_record = shift; | ||||
640 | my $marc_flavour = shift; | ||||
641 | my $source_encoding = shift; | ||||
642 | |||||
643 | my @errors = (); | ||||
644 | my $decoder; | ||||
645 | eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); }; | ||||
646 | if ($@) { | ||||
647 | push @errors, "Could not initialze $source_encoding => utf8 converter: $@"; | ||||
648 | return @errors; | ||||
649 | } | ||||
650 | |||||
651 | my $prev_raise_error = Text::Iconv->raise_error(); | ||||
652 | Text::Iconv->raise_error(1); | ||||
653 | |||||
654 | foreach my $field ($marc_record->fields()) { | ||||
655 | if ($field->is_control_field()) { | ||||
656 | ; # do nothing -- control fields should not contain non-ASCII characters | ||||
657 | } else { | ||||
658 | my @converted_subfields; | ||||
659 | foreach my $subfield ($field->subfields()) { | ||||
660 | my $converted_value; | ||||
661 | my $conversion_ok = 1; | ||||
662 | eval { $converted_value = $decoder->convert($subfield->[1]); }; | ||||
663 | if ($@) { | ||||
664 | $conversion_ok = 0; | ||||
665 | push @errors, $@; | ||||
666 | } elsif (not defined $converted_value) { | ||||
667 | $conversion_ok = 0; | ||||
668 | push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval(); | ||||
669 | } | ||||
670 | |||||
671 | if ($conversion_ok) { | ||||
672 | push @converted_subfields, $subfield->[0], $converted_value; | ||||
673 | } else { | ||||
674 | $converted_value = $subfield->[1]; | ||||
675 | $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g; | ||||
676 | push @converted_subfields, $subfield->[0], $converted_value; | ||||
677 | } | ||||
678 | } | ||||
679 | |||||
680 | $field->replace_with(MARC::Field->new( | ||||
681 | $field->tag(), $field->indicator(1), $field->indicator(2), | ||||
682 | @converted_subfields) | ||||
683 | ); | ||||
684 | } | ||||
685 | } | ||||
686 | |||||
687 | SetMarcUnicodeFlag($marc_record, $marc_flavour); | ||||
688 | Text::Iconv->raise_error($prev_raise_error); | ||||
689 | |||||
690 | return @errors; | ||||
691 | } | ||||
692 | |||||
693 | =head2 _marc_to_utf8_replacement_char | ||||
694 | |||||
695 | _marc_to_utf8_replacement_char($marc_record, $marc_flavour); | ||||
696 | |||||
697 | Convert a C<MARC::Record> to UTF-8 in-place, adopting the | ||||
698 | unsatisfactory method of replacing all non-ASCII (e.g., | ||||
699 | where the eight bit is set) octet with the Unicode | ||||
700 | replacement character. This is meant as a last-ditch | ||||
701 | method, and would be best used as part of a UI that | ||||
702 | lets a cataloguer pick various character conversions | ||||
703 | until he or she finds the right one. | ||||
704 | |||||
705 | =cut | ||||
706 | |||||
707 | sub _marc_to_utf8_replacement_char { | ||||
708 | my $marc_record = shift; | ||||
709 | my $marc_flavour = shift; | ||||
710 | |||||
711 | foreach my $field ($marc_record->fields()) { | ||||
712 | if ($field->is_control_field()) { | ||||
713 | ; # do nothing -- control fields should not contain non-ASCII characters | ||||
714 | } else { | ||||
715 | my @converted_subfields; | ||||
716 | foreach my $subfield ($field->subfields()) { | ||||
717 | my $value = $subfield->[1]; | ||||
718 | $value =~ s/[\200-\377]/\xef\xbf\xbd/g; | ||||
719 | push @converted_subfields, $subfield->[0], $value; | ||||
720 | } | ||||
721 | |||||
722 | $field->replace_with(MARC::Field->new( | ||||
723 | $field->tag(), $field->indicator(1), $field->indicator(2), | ||||
724 | @converted_subfields) | ||||
725 | ); | ||||
726 | } | ||||
727 | } | ||||
728 | |||||
729 | SetMarcUnicodeFlag($marc_record, $marc_flavour); | ||||
730 | } | ||||
731 | |||||
732 | =head2 char_decode5426 | ||||
733 | |||||
734 | my $utf8string = char_decode5426($iso_5426_string); | ||||
735 | |||||
736 | Converts a string from ISO-5426 to UTF-8. | ||||
737 | |||||
738 | =cut | ||||
739 | |||||
740 | |||||
741 | 1 | 300ns | my %chars; | ||
742 | 1 | 1µs | $chars{0xb0}=0x0101;#3/0ayn[ain] | ||
743 | 1 | 200ns | $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove] | ||
744 | #$chars{0xb2}=0x00e0;#'Ã '; | ||||
745 | 1 | 100ns | $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark | ||
746 | #$chars{0xb3}=0x00e7;#'ç'; | ||||
747 | 1 | 100ns | $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark | ||
748 | # $chars{0xb4}='è'; | ||||
749 | 1 | 200ns | $chars{0xb4}=0x00e8; | ||
750 | 1 | 200ns | $chars{0xbd}=0x02b9; | ||
751 | 1 | 200ns | $chars{0xbe}=0x02ba; | ||
752 | # $chars{0xb5}='é'; | ||||
753 | 1 | 600ns | $chars{0xb5}=0x00e9; | ||
754 | 1 | 200ns | $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark | ||
755 | 1 | 100ns | $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark | ||
756 | 1 | 200ns | $chars{0xfa}=0x0153; #oe | ||
757 | 1 | 100ns | $chars{0xea}=0x0152; #oe | ||
758 | 1 | 200ns | $chars{0x81d1}=0x00b0; | ||
759 | |||||
760 | #### | ||||
761 | ## combined characters iso5426 | ||||
762 | |||||
763 | 1 | 200ns | $chars{0xc041}=0x1ea2; # capital a with hook above | ||
764 | 1 | 200ns | $chars{0xc045}=0x1eba; # capital e with hook above | ||
765 | 1 | 800ns | $chars{0xc049}=0x1ec8; # capital i with hook above | ||
766 | 1 | 100ns | $chars{0xc04f}=0x1ece; # capital o with hook above | ||
767 | 1 | 200ns | $chars{0xc055}=0x1ee6; # capital u with hook above | ||
768 | 1 | 200ns | $chars{0xc059}=0x1ef6; # capital y with hook above | ||
769 | 1 | 200ns | $chars{0xc061}=0x1ea3; # small a with hook above | ||
770 | 1 | 100ns | $chars{0xc065}=0x1ebb; # small e with hook above | ||
771 | 1 | 100ns | $chars{0xc069}=0x1ec9; # small i with hook above | ||
772 | 1 | 100ns | $chars{0xc06f}=0x1ecf; # small o with hook above | ||
773 | 1 | 100ns | $chars{0xc075}=0x1ee7; # small u with hook above | ||
774 | 1 | 200ns | $chars{0xc079}=0x1ef7; # small y with hook above | ||
775 | |||||
776 | # 4/1 grave accent | ||||
777 | 1 | 200ns | $chars{0xc141}=0x00c0; # capital a with grave accent | ||
778 | 1 | 100ns | $chars{0xc145}=0x00c8; # capital e with grave accent | ||
779 | 1 | 200ns | $chars{0xc149}=0x00cc; # capital i with grave accent | ||
780 | 1 | 100ns | $chars{0xc14f}=0x00d2; # capital o with grave accent | ||
781 | 1 | 200ns | $chars{0xc155}=0x00d9; # capital u with grave accent | ||
782 | 1 | 100ns | $chars{0xc157}=0x1e80; # capital w with grave | ||
783 | 1 | 3µs | $chars{0xc159}=0x1ef2; # capital y with grave | ||
784 | 1 | 100ns | $chars{0xc161}=0x00e0; # small a with grave accent | ||
785 | 1 | 200ns | $chars{0xc165}=0x00e8; # small e with grave accent | ||
786 | 1 | 200ns | $chars{0xc169}=0x00ec; # small i with grave accent | ||
787 | 1 | 100ns | $chars{0xc16f}=0x00f2; # small o with grave accent | ||
788 | 1 | 100ns | $chars{0xc175}=0x00f9; # small u with grave accent | ||
789 | 1 | 100ns | $chars{0xc177}=0x1e81; # small w with grave | ||
790 | 1 | 200ns | $chars{0xc179}=0x1ef3; # small y with grave | ||
791 | # 4/2 acute accent | ||||
792 | 1 | 100ns | $chars{0xc241}=0x00c1; # capital a with acute accent | ||
793 | 1 | 300ns | $chars{0xc243}=0x0106; # capital c with acute accent | ||
794 | 1 | 100ns | $chars{0xc245}=0x00c9; # capital e with acute accent | ||
795 | 1 | 200ns | $chars{0xc247}=0x01f4; # capital g with acute | ||
796 | 1 | 100ns | $chars{0xc249}=0x00cd; # capital i with acute accent | ||
797 | 1 | 100ns | $chars{0xc24b}=0x1e30; # capital k with acute | ||
798 | 1 | 100ns | $chars{0xc24c}=0x0139; # capital l with acute accent | ||
799 | 1 | 200ns | $chars{0xc24d}=0x1e3e; # capital m with acute | ||
800 | 1 | 100ns | $chars{0xc24e}=0x0143; # capital n with acute accent | ||
801 | 1 | 100ns | $chars{0xc24f}=0x00d3; # capital o with acute accent | ||
802 | 1 | 100ns | $chars{0xc250}=0x1e54; # capital p with acute | ||
803 | 1 | 200ns | $chars{0xc252}=0x0154; # capital r with acute accent | ||
804 | 1 | 100ns | $chars{0xc253}=0x015a; # capital s with acute accent | ||
805 | 1 | 200ns | $chars{0xc255}=0x00da; # capital u with acute accent | ||
806 | 1 | 100ns | $chars{0xc257}=0x1e82; # capital w with acute | ||
807 | 1 | 100ns | $chars{0xc259}=0x00dd; # capital y with acute accent | ||
808 | 1 | 100ns | $chars{0xc25a}=0x0179; # capital z with acute accent | ||
809 | 1 | 200ns | $chars{0xc261}=0x00e1; # small a with acute accent | ||
810 | 1 | 200ns | $chars{0xc263}=0x0107; # small c with acute accent | ||
811 | 1 | 100ns | $chars{0xc265}=0x00e9; # small e with acute accent | ||
812 | 1 | 200ns | $chars{0xc267}=0x01f5; # small g with acute | ||
813 | 1 | 100ns | $chars{0xc269}=0x00ed; # small i with acute accent | ||
814 | 1 | 100ns | $chars{0xc26b}=0x1e31; # small k with acute | ||
815 | 1 | 100ns | $chars{0xc26c}=0x013a; # small l with acute accent | ||
816 | 1 | 1µs | $chars{0xc26d}=0x1e3f; # small m with acute | ||
817 | 1 | 100ns | $chars{0xc26e}=0x0144; # small n with acute accent | ||
818 | 1 | 200ns | $chars{0xc26f}=0x00f3; # small o with acute accent | ||
819 | 1 | 100ns | $chars{0xc270}=0x1e55; # small p with acute | ||
820 | 1 | 100ns | $chars{0xc272}=0x0155; # small r with acute accent | ||
821 | 1 | 100ns | $chars{0xc273}=0x015b; # small s with acute accent | ||
822 | 1 | 100ns | $chars{0xc275}=0x00fa; # small u with acute accent | ||
823 | 1 | 100ns | $chars{0xc277}=0x1e83; # small w with acute | ||
824 | 1 | 100ns | $chars{0xc279}=0x00fd; # small y with acute accent | ||
825 | 1 | 200ns | $chars{0xc27a}=0x017a; # small z with acute accent | ||
826 | 1 | 100ns | $chars{0xc2e1}=0x01fc; # capital ae with acute | ||
827 | 1 | 200ns | $chars{0xc2f1}=0x01fd; # small ae with acute | ||
828 | # 4/3 circumflex accent | ||||
829 | 1 | 200ns | $chars{0xc341}=0x00c2; # capital a with circumflex accent | ||
830 | 1 | 200ns | $chars{0xc343}=0x0108; # capital c with circumflex | ||
831 | 1 | 200ns | $chars{0xc345}=0x00ca; # capital e with circumflex accent | ||
832 | 1 | 100ns | $chars{0xc347}=0x011c; # capital g with circumflex | ||
833 | 1 | 200ns | $chars{0xc348}=0x0124; # capital h with circumflex | ||
834 | 1 | 100ns | $chars{0xc349}=0x00ce; # capital i with circumflex accent | ||
835 | 1 | 200ns | $chars{0xc34a}=0x0134; # capital j with circumflex | ||
836 | 1 | 100ns | $chars{0xc34f}=0x00d4; # capital o with circumflex accent | ||
837 | 1 | 200ns | $chars{0xc353}=0x015c; # capital s with circumflex | ||
838 | 1 | 100ns | $chars{0xc355}=0x00db; # capital u with circumflex | ||
839 | 1 | 200ns | $chars{0xc357}=0x0174; # capital w with circumflex | ||
840 | 1 | 100ns | $chars{0xc359}=0x0176; # capital y with circumflex | ||
841 | 1 | 100ns | $chars{0xc35a}=0x1e90; # capital z with circumflex | ||
842 | 1 | 100ns | $chars{0xc361}=0x00e2; # small a with circumflex accent | ||
843 | 1 | 200ns | $chars{0xc363}=0x0109; # small c with circumflex | ||
844 | 1 | 100ns | $chars{0xc365}=0x00ea; # small e with circumflex accent | ||
845 | 1 | 200ns | $chars{0xc367}=0x011d; # small g with circumflex | ||
846 | 1 | 100ns | $chars{0xc368}=0x0125; # small h with circumflex | ||
847 | 1 | 200ns | $chars{0xc369}=0x00ee; # small i with circumflex accent | ||
848 | 1 | 100ns | $chars{0xc36a}=0x0135; # small j with circumflex | ||
849 | 1 | 100ns | $chars{0xc36e}=0x00f1; # small n with tilde | ||
850 | 1 | 200ns | $chars{0xc36f}=0x00f4; # small o with circumflex accent | ||
851 | 1 | 200ns | $chars{0xc373}=0x015d; # small s with circumflex | ||
852 | 1 | 100ns | $chars{0xc375}=0x00fb; # small u with circumflex | ||
853 | 1 | 200ns | $chars{0xc377}=0x0175; # small w with circumflex | ||
854 | 1 | 200ns | $chars{0xc379}=0x0177; # small y with circumflex | ||
855 | 1 | 200ns | $chars{0xc37a}=0x1e91; # small z with circumflex | ||
856 | # 4/4 tilde | ||||
857 | 1 | 200ns | $chars{0xc441}=0x00c3; # capital a with tilde | ||
858 | 1 | 100ns | $chars{0xc445}=0x1ebc; # capital e with tilde | ||
859 | 1 | 200ns | $chars{0xc449}=0x0128; # capital i with tilde | ||
860 | 1 | 100ns | $chars{0xc44e}=0x00d1; # capital n with tilde | ||
861 | 1 | 200ns | $chars{0xc44f}=0x00d5; # capital o with tilde | ||
862 | 1 | 100ns | $chars{0xc455}=0x0168; # capital u with tilde | ||
863 | 1 | 200ns | $chars{0xc456}=0x1e7c; # capital v with tilde | ||
864 | 1 | 100ns | $chars{0xc459}=0x1ef8; # capital y with tilde | ||
865 | 1 | 200ns | $chars{0xc461}=0x00e3; # small a with tilde | ||
866 | 1 | 100ns | $chars{0xc465}=0x1ebd; # small e with tilde | ||
867 | 1 | 200ns | $chars{0xc469}=0x0129; # small i with tilde | ||
868 | 1 | 200ns | $chars{0xc46e}=0x00f1; # small n with tilde | ||
869 | 1 | 100ns | $chars{0xc46f}=0x00f5; # small o with tilde | ||
870 | 1 | 200ns | $chars{0xc475}=0x0169; # small u with tilde | ||
871 | 1 | 200ns | $chars{0xc476}=0x1e7d; # small v with tilde | ||
872 | 1 | 100ns | $chars{0xc479}=0x1ef9; # small y with tilde | ||
873 | # 4/5 macron | ||||
874 | 1 | 200ns | $chars{0xc541}=0x0100; # capital a with macron | ||
875 | 1 | 200ns | $chars{0xc545}=0x0112; # capital e with macron | ||
876 | 1 | 100ns | $chars{0xc547}=0x1e20; # capital g with macron | ||
877 | 1 | 200ns | $chars{0xc549}=0x012a; # capital i with macron | ||
878 | 1 | 200ns | $chars{0xc54f}=0x014c; # capital o with macron | ||
879 | 1 | 100ns | $chars{0xc555}=0x016a; # capital u with macron | ||
880 | 1 | 200ns | $chars{0xc561}=0x0101; # small a with macron | ||
881 | 1 | 500ns | $chars{0xc565}=0x0113; # small e with macron | ||
882 | 1 | 200ns | $chars{0xc567}=0x1e21; # small g with macron | ||
883 | 1 | 2µs | $chars{0xc569}=0x012b; # small i with macron | ||
884 | 1 | 200ns | $chars{0xc56f}=0x014d; # small o with macron | ||
885 | 1 | 200ns | $chars{0xc575}=0x016b; # small u with macron | ||
886 | 1 | 200ns | $chars{0xc572}=0x0159; # small r with macron | ||
887 | 1 | 100ns | $chars{0xc5e1}=0x01e2; # capital ae with macron | ||
888 | 1 | 200ns | $chars{0xc5f1}=0x01e3; # small ae with macron | ||
889 | # 4/6 breve | ||||
890 | 1 | 100ns | $chars{0xc641}=0x0102; # capital a with breve | ||
891 | 1 | 200ns | $chars{0xc645}=0x0114; # capital e with breve | ||
892 | 1 | 100ns | $chars{0xc647}=0x011e; # capital g with breve | ||
893 | 1 | 200ns | $chars{0xc649}=0x012c; # capital i with breve | ||
894 | 1 | 100ns | $chars{0xc64f}=0x014e; # capital o with breve | ||
895 | 1 | 200ns | $chars{0xc655}=0x016c; # capital u with breve | ||
896 | 1 | 100ns | $chars{0xc661}=0x0103; # small a with breve | ||
897 | 1 | 200ns | $chars{0xc665}=0x0115; # small e with breve | ||
898 | 1 | 100ns | $chars{0xc667}=0x011f; # small g with breve | ||
899 | 1 | 100ns | $chars{0xc669}=0x012d; # small i with breve | ||
900 | 1 | 100ns | $chars{0xc66f}=0x014f; # small o with breve | ||
901 | 1 | 200ns | $chars{0xc675}=0x016d; # small u with breve | ||
902 | # 4/7 dot above | ||||
903 | 1 | 100ns | $chars{0xc7b0}=0x01e1; # Ain with dot above | ||
904 | 1 | 200ns | $chars{0xc742}=0x1e02; # capital b with dot above | ||
905 | 1 | 200ns | $chars{0xc743}=0x010a; # capital c with dot above | ||
906 | 1 | 100ns | $chars{0xc744}=0x1e0a; # capital d with dot above | ||
907 | 1 | 200ns | $chars{0xc745}=0x0116; # capital e with dot above | ||
908 | 1 | 100ns | $chars{0xc746}=0x1e1e; # capital f with dot above | ||
909 | 1 | 200ns | $chars{0xc747}=0x0120; # capital g with dot above | ||
910 | 1 | 200ns | $chars{0xc748}=0x1e22; # capital h with dot above | ||
911 | 1 | 100ns | $chars{0xc749}=0x0130; # capital i with dot above | ||
912 | 1 | 200ns | $chars{0xc74d}=0x1e40; # capital m with dot above | ||
913 | 1 | 200ns | $chars{0xc74e}=0x1e44; # capital n with dot above | ||
914 | 1 | 100ns | $chars{0xc750}=0x1e56; # capital p with dot above | ||
915 | 1 | 200ns | $chars{0xc752}=0x1e58; # capital r with dot above | ||
916 | 1 | 100ns | $chars{0xc753}=0x1e60; # capital s with dot above | ||
917 | 1 | 100ns | $chars{0xc754}=0x1e6a; # capital t with dot above | ||
918 | 1 | 200ns | $chars{0xc757}=0x1e86; # capital w with dot above | ||
919 | 1 | 100ns | $chars{0xc758}=0x1e8a; # capital x with dot above | ||
920 | 1 | 200ns | $chars{0xc759}=0x1e8e; # capital y with dot above | ||
921 | 1 | 200ns | $chars{0xc75a}=0x017b; # capital z with dot above | ||
922 | 1 | 200ns | $chars{0xc761}=0x0227; # small b with dot above | ||
923 | 1 | 100ns | $chars{0xc762}=0x1e03; # small b with dot above | ||
924 | 1 | 200ns | $chars{0xc763}=0x010b; # small c with dot above | ||
925 | 1 | 100ns | $chars{0xc764}=0x1e0b; # small d with dot above | ||
926 | 1 | 100ns | $chars{0xc765}=0x0117; # small e with dot above | ||
927 | 1 | 100ns | $chars{0xc766}=0x1e1f; # small f with dot above | ||
928 | 1 | 200ns | $chars{0xc767}=0x0121; # small g with dot above | ||
929 | 1 | 100ns | $chars{0xc768}=0x1e23; # small h with dot above | ||
930 | 1 | 100ns | $chars{0xc76d}=0x1e41; # small m with dot above | ||
931 | 1 | 100ns | $chars{0xc76e}=0x1e45; # small n with dot above | ||
932 | 1 | 100ns | $chars{0xc770}=0x1e57; # small p with dot above | ||
933 | 1 | 100ns | $chars{0xc772}=0x1e59; # small r with dot above | ||
934 | 1 | 200ns | $chars{0xc773}=0x1e61; # small s with dot above | ||
935 | 1 | 100ns | $chars{0xc774}=0x1e6b; # small t with dot above | ||
936 | 1 | 200ns | $chars{0xc777}=0x1e87; # small w with dot above | ||
937 | 1 | 100ns | $chars{0xc778}=0x1e8b; # small x with dot above | ||
938 | 1 | 100ns | $chars{0xc779}=0x1e8f; # small y with dot above | ||
939 | 1 | 200ns | $chars{0xc77a}=0x017c; # small z with dot above | ||
940 | # 4/8 trema, diaresis | ||||
941 | 1 | 100ns | $chars{0xc820}=0x00a8; # diaeresis | ||
942 | 1 | 200ns | $chars{0xc841}=0x00c4; # capital a with diaeresis | ||
943 | 1 | 200ns | $chars{0xc845}=0x00cb; # capital e with diaeresis | ||
944 | 1 | 100ns | $chars{0xc848}=0x1e26; # capital h with diaeresis | ||
945 | 1 | 200ns | $chars{0xc849}=0x00cf; # capital i with diaeresis | ||
946 | 1 | 100ns | $chars{0xc84f}=0x00d6; # capital o with diaeresis | ||
947 | 1 | 600ns | $chars{0xc855}=0x00dc; # capital u with diaeresis | ||
948 | 1 | 200ns | $chars{0xc857}=0x1e84; # capital w with diaeresis | ||
949 | 1 | 100ns | $chars{0xc858}=0x1e8c; # capital x with diaeresis | ||
950 | 1 | 100ns | $chars{0xc859}=0x0178; # capital y with diaeresis | ||
951 | 1 | 200ns | $chars{0xc861}=0x00e4; # small a with diaeresis | ||
952 | 1 | 100ns | $chars{0xc865}=0x00eb; # small e with diaeresis | ||
953 | 1 | 200ns | $chars{0xc868}=0x1e27; # small h with diaeresis | ||
954 | 1 | 200ns | $chars{0xc869}=0x00ef; # small i with diaeresis | ||
955 | 1 | 100ns | $chars{0xc86f}=0x00f6; # small o with diaeresis | ||
956 | 1 | 100ns | $chars{0xc874}=0x1e97; # small t with diaeresis | ||
957 | 1 | 200ns | $chars{0xc875}=0x00fc; # small u with diaeresis | ||
958 | 1 | 200ns | $chars{0xc877}=0x1e85; # small w with diaeresis | ||
959 | 1 | 200ns | $chars{0xc878}=0x1e8d; # small x with diaeresis | ||
960 | 1 | 2µs | $chars{0xc879}=0x00ff; # small y with diaeresis | ||
961 | # 4/9 umlaut | ||||
962 | 1 | 200ns | $chars{0xc920}=0x00a8; # [diaeresis] | ||
963 | 1 | 100ns | $chars{0xc961}=0x00e4; # a with umlaut | ||
964 | 1 | 200ns | $chars{0xc965}=0x00eb; # e with umlaut | ||
965 | 1 | 200ns | $chars{0xc969}=0x00ef; # i with umlaut | ||
966 | 1 | 100ns | $chars{0xc96f}=0x00f6; # o with umlaut | ||
967 | 1 | 100ns | $chars{0xc975}=0x00fc; # u with umlaut | ||
968 | # 4/10 circle above | ||||
969 | 1 | 100ns | $chars{0xca41}=0x00c5; # capital a with ring above | ||
970 | 1 | 200ns | $chars{0xcaad}=0x016e; # capital u with ring above | ||
971 | 1 | 100ns | $chars{0xca61}=0x00e5; # small a with ring above | ||
972 | 1 | 100ns | $chars{0xca75}=0x016f; # small u with ring above | ||
973 | 1 | 100ns | $chars{0xca77}=0x1e98; # small w with ring above | ||
974 | 1 | 100ns | $chars{0xca79}=0x1e99; # small y with ring above | ||
975 | # 4/11 high comma off centre | ||||
976 | # 4/12 inverted high comma centred | ||||
977 | # 4/13 double acute accent | ||||
978 | 1 | 100ns | $chars{0xcd4f}=0x0150; # capital o with double acute | ||
979 | 1 | 200ns | $chars{0xcd55}=0x0170; # capital u with double acute | ||
980 | 1 | 200ns | $chars{0xcd6f}=0x0151; # small o with double acute | ||
981 | 1 | 100ns | $chars{0xcd75}=0x0171; # small u with double acute | ||
982 | # 4/14 horn | ||||
983 | 1 | 100ns | $chars{0xce54}=0x01a0; # latin capital letter o with horn | ||
984 | 1 | 100ns | $chars{0xce55}=0x01af; # latin capital letter u with horn | ||
985 | 1 | 100ns | $chars{0xce74}=0x01a1; # latin small letter o with horn | ||
986 | 1 | 200ns | $chars{0xce75}=0x01b0; # latin small letter u with horn | ||
987 | # 4/15 caron (hacek | ||||
988 | 1 | 100ns | $chars{0xcf41}=0x01cd; # capital a with caron | ||
989 | 1 | 200ns | $chars{0xcf43}=0x010c; # capital c with caron | ||
990 | 1 | 100ns | $chars{0xcf44}=0x010e; # capital d with caron | ||
991 | 1 | 200ns | $chars{0xcf45}=0x011a; # capital e with caron | ||
992 | 1 | 100ns | $chars{0xcf47}=0x01e6; # capital g with caron | ||
993 | 1 | 200ns | $chars{0xcf49}=0x01cf; # capital i with caron | ||
994 | 1 | 100ns | $chars{0xcf4b}=0x01e8; # capital k with caron | ||
995 | 1 | 200ns | $chars{0xcf4c}=0x013d; # capital l with caron | ||
996 | 1 | 100ns | $chars{0xcf4e}=0x0147; # capital n with caron | ||
997 | 1 | 200ns | $chars{0xcf4f}=0x01d1; # capital o with caron | ||
998 | 1 | 100ns | $chars{0xcf52}=0x0158; # capital r with caron | ||
999 | 1 | 200ns | $chars{0xcf53}=0x0160; # capital s with caron | ||
1000 | 1 | 100ns | $chars{0xcf54}=0x0164; # capital t with caron | ||
1001 | 1 | 200ns | $chars{0xcf55}=0x01d3; # capital u with caron | ||
1002 | 1 | 100ns | $chars{0xcf5a}=0x017d; # capital z with caron | ||
1003 | 1 | 200ns | $chars{0xcf61}=0x01ce; # small a with caron | ||
1004 | 1 | 100ns | $chars{0xcf63}=0x010d; # small c with caron | ||
1005 | 1 | 200ns | $chars{0xcf64}=0x010f; # small d with caron | ||
1006 | 1 | 100ns | $chars{0xcf65}=0x011b; # small e with caron | ||
1007 | 1 | 100ns | $chars{0xcf67}=0x01e7; # small g with caron | ||
1008 | 1 | 100ns | $chars{0xcf69}=0x01d0; # small i with caron | ||
1009 | 1 | 100ns | $chars{0xcf6a}=0x01f0; # small j with caron | ||
1010 | 1 | 100ns | $chars{0xcf6b}=0x01e9; # small k with caron | ||
1011 | 1 | 100ns | $chars{0xcf6c}=0x013e; # small l with caron | ||
1012 | 1 | 100ns | $chars{0xcf6e}=0x0148; # small n with caron | ||
1013 | 1 | 200ns | $chars{0xcf6f}=0x01d2; # small o with caron | ||
1014 | 1 | 100ns | $chars{0xcf72}=0x0159; # small r with caron | ||
1015 | 1 | 100ns | $chars{0xcf73}=0x0161; # small s with caron | ||
1016 | 1 | 200ns | $chars{0xcf74}=0x0165; # small t with caron | ||
1017 | 1 | 100ns | $chars{0xcf75}=0x01d4; # small u with caron | ||
1018 | 1 | 200ns | $chars{0xcf7a}=0x017e; # small z with caron | ||
1019 | # 5/0 cedilla | ||||
1020 | 1 | 200ns | $chars{0xd020}=0x00b8; # cedilla | ||
1021 | 1 | 100ns | $chars{0xd043}=0x00c7; # capital c with cedilla | ||
1022 | 1 | 5µs | $chars{0xd044}=0x1e10; # capital d with cedilla | ||
1023 | 1 | 200ns | $chars{0xd047}=0x0122; # capital g with cedilla | ||
1024 | 1 | 200ns | $chars{0xd048}=0x1e28; # capital h with cedilla | ||
1025 | 1 | 100ns | $chars{0xd04b}=0x0136; # capital k with cedilla | ||
1026 | 1 | 200ns | $chars{0xd04c}=0x013b; # capital l with cedilla | ||
1027 | 1 | 100ns | $chars{0xd04e}=0x0145; # capital n with cedilla | ||
1028 | 1 | 200ns | $chars{0xd052}=0x0156; # capital r with cedilla | ||
1029 | 1 | 200ns | $chars{0xd053}=0x015e; # capital s with cedilla | ||
1030 | 1 | 100ns | $chars{0xd054}=0x0162; # capital t with cedilla | ||
1031 | 1 | 200ns | $chars{0xd063}=0x00e7; # small c with cedilla | ||
1032 | 1 | 200ns | $chars{0xd064}=0x1e11; # small d with cedilla | ||
1033 | 1 | 100ns | $chars{0xd065}=0x0119; # small e with cedilla | ||
1034 | 1 | 200ns | $chars{0xd067}=0x0123; # small g with cedilla | ||
1035 | 1 | 100ns | $chars{0xd068}=0x1e29; # small h with cedilla | ||
1036 | 1 | 200ns | $chars{0xd06b}=0x0137; # small k with cedilla | ||
1037 | 1 | 200ns | $chars{0xd06c}=0x013c; # small l with cedilla | ||
1038 | 1 | 200ns | $chars{0xd06e}=0x0146; # small n with cedilla | ||
1039 | 1 | 100ns | $chars{0xd072}=0x0157; # small r with cedilla | ||
1040 | 1 | 100ns | $chars{0xd073}=0x015f; # small s with cedilla | ||
1041 | 1 | 100ns | $chars{0xd074}=0x0163; # small t with cedilla | ||
1042 | # 5/1 rude | ||||
1043 | # 5/2 hook to left | ||||
1044 | # 5/3 ogonek (hook to right | ||||
1045 | 1 | 100ns | $chars{0xd320}=0x02db; # ogonek | ||
1046 | 1 | 100ns | $chars{0xd341}=0x0104; # capital a with ogonek | ||
1047 | 1 | 100ns | $chars{0xd345}=0x0118; # capital e with ogonek | ||
1048 | 1 | 200ns | $chars{0xd349}=0x012e; # capital i with ogonek | ||
1049 | 1 | 100ns | $chars{0xd34f}=0x01ea; # capital o with ogonek | ||
1050 | 1 | 200ns | $chars{0xd355}=0x0172; # capital u with ogonek | ||
1051 | 1 | 200ns | $chars{0xd361}=0x0105; # small a with ogonek | ||
1052 | 1 | 100ns | $chars{0xd365}=0x0119; # small e with ogonek | ||
1053 | 1 | 200ns | $chars{0xd369}=0x012f; # small i with ogonek | ||
1054 | 1 | 100ns | $chars{0xd36f}=0x01eb; # small o with ogonek | ||
1055 | 1 | 200ns | $chars{0xd375}=0x0173; # small u with ogonek | ||
1056 | # 5/4 circle below | ||||
1057 | 1 | 100ns | $chars{0xd441}=0x1e00; # capital a with ring below | ||
1058 | 1 | 200ns | $chars{0xd461}=0x1e01; # small a with ring below | ||
1059 | # 5/5 half circle below | ||||
1060 | 1 | 100ns | $chars{0xf948}=0x1e2a; # capital h with breve below | ||
1061 | 1 | 100ns | $chars{0xf968}=0x1e2b; # small h with breve below | ||
1062 | # 5/6 dot below | ||||
1063 | 1 | 100ns | $chars{0xd641}=0x1ea0; # capital a with dot below | ||
1064 | 1 | 200ns | $chars{0xd642}=0x1e04; # capital b with dot below | ||
1065 | 1 | 100ns | $chars{0xd644}=0x1e0c; # capital d with dot below | ||
1066 | 1 | 200ns | $chars{0xd645}=0x1eb8; # capital e with dot below | ||
1067 | 1 | 100ns | $chars{0xd648}=0x1e24; # capital h with dot below | ||
1068 | 1 | 100ns | $chars{0xd649}=0x1eca; # capital i with dot below | ||
1069 | 1 | 200ns | $chars{0xd64b}=0x1e32; # capital k with dot below | ||
1070 | 1 | 100ns | $chars{0xd64c}=0x1e36; # capital l with dot below | ||
1071 | 1 | 200ns | $chars{0xd64d}=0x1e42; # capital m with dot below | ||
1072 | 1 | 100ns | $chars{0xd64e}=0x1e46; # capital n with dot below | ||
1073 | 1 | 200ns | $chars{0xd64f}=0x1ecc; # capital o with dot below | ||
1074 | 1 | 100ns | $chars{0xd652}=0x1e5a; # capital r with dot below | ||
1075 | 1 | 200ns | $chars{0xd653}=0x1e62; # capital s with dot below | ||
1076 | 1 | 100ns | $chars{0xd654}=0x1e6c; # capital t with dot below | ||
1077 | 1 | 200ns | $chars{0xd655}=0x1ee4; # capital u with dot below | ||
1078 | 1 | 100ns | $chars{0xd656}=0x1e7e; # capital v with dot below | ||
1079 | 1 | 100ns | $chars{0xd657}=0x1e88; # capital w with dot below | ||
1080 | 1 | 200ns | $chars{0xd659}=0x1ef4; # capital y with dot below | ||
1081 | 1 | 100ns | $chars{0xd65a}=0x1e92; # capital z with dot below | ||
1082 | 1 | 100ns | $chars{0xd661}=0x1ea1; # small a with dot below | ||
1083 | 1 | 100ns | $chars{0xd662}=0x1e05; # small b with dot below | ||
1084 | 1 | 100ns | $chars{0xd664}=0x1e0d; # small d with dot below | ||
1085 | 1 | 200ns | $chars{0xd665}=0x1eb9; # small e with dot below | ||
1086 | 1 | 100ns | $chars{0xd668}=0x1e25; # small h with dot below | ||
1087 | 1 | 100ns | $chars{0xd669}=0x1ecb; # small i with dot below | ||
1088 | 1 | 100ns | $chars{0xd66b}=0x1e33; # small k with dot below | ||
1089 | 1 | 100ns | $chars{0xd66c}=0x1e37; # small l with dot below | ||
1090 | 1 | 100ns | $chars{0xd66d}=0x1e43; # small m with dot below | ||
1091 | 1 | 200ns | $chars{0xd66e}=0x1e47; # small n with dot below | ||
1092 | 1 | 100ns | $chars{0xd66f}=0x1ecd; # small o with dot below | ||
1093 | 1 | 100ns | $chars{0xd672}=0x1e5b; # small r with dot below | ||
1094 | 1 | 100ns | $chars{0xd673}=0x1e63; # small s with dot below | ||
1095 | 1 | 200ns | $chars{0xd674}=0x1e6d; # small t with dot below | ||
1096 | 1 | 100ns | $chars{0xd675}=0x1ee5; # small u with dot below | ||
1097 | 1 | 100ns | $chars{0xd676}=0x1e7f; # small v with dot below | ||
1098 | 1 | 100ns | $chars{0xd677}=0x1e89; # small w with dot below | ||
1099 | 1 | 200ns | $chars{0xd679}=0x1ef5; # small y with dot below | ||
1100 | 1 | 100ns | $chars{0xd67a}=0x1e93; # small z with dot below | ||
1101 | # 5/7 double dot below | ||||
1102 | 1 | 200ns | $chars{0xd755}=0x1e72; # capital u with diaeresis below | ||
1103 | 1 | 100ns | $chars{0xd775}=0x1e73; # small u with diaeresis below | ||
1104 | # 5/8 underline | ||||
1105 | 1 | 200ns | $chars{0xd820}=0x005f; # underline | ||
1106 | # 5/9 double underline | ||||
1107 | 1 | 100ns | $chars{0xd920}=0x2017; # double underline | ||
1108 | # 5/10 small low vertical bar | ||||
1109 | 1 | 200ns | $chars{0xda20}=0x02cc; # | ||
1110 | # 5/11 circumflex below | ||||
1111 | # 5/12 (this position shall not be used) | ||||
1112 | # 5/13 left half of ligature sign and of double tilde | ||||
1113 | # 5/14 right half of ligature sign | ||||
1114 | # 5/15 right half of double tilde | ||||
1115 | # map {printf "%x :%x\n",$_,$chars{$_};}keys %chars; | ||||
1116 | |||||
1117 | sub char_decode5426 { | ||||
1118 | my ( $string) = @_; | ||||
1119 | my $result; | ||||
1120 | |||||
1121 | my @data = unpack("C*", $string); | ||||
1122 | my @characters; | ||||
1123 | my $length=scalar(@data); | ||||
1124 | for (my $i = 0; $i < scalar(@data); $i++) { | ||||
1125 | my $char= $data[$i]; | ||||
1126 | if ($char >= 0x00 && $char <= 0x7F){ | ||||
1127 | #IsAscii | ||||
1128 | |||||
1129 | push @characters,$char unless ($char<0x02 ||$char== 0x0F); | ||||
1130 | }elsif (($char >= 0xC0 && $char <= 0xDF)) { | ||||
1131 | #Combined Char | ||||
1132 | my $convchar ; | ||||
1133 | if ($chars{$char*256+$data[$i+1]}) { | ||||
1134 | $convchar= $chars{$char * 256 + $data[$i+1]}; | ||||
1135 | $i++; | ||||
1136 | # printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar; | ||||
1137 | } elsif ($chars{$char}) { | ||||
1138 | $convchar= $chars{$char}; | ||||
1139 | # printf "0xC char %x, converted %x\n",$char,$chars{$char}; | ||||
1140 | }else { | ||||
1141 | $convchar=$char; | ||||
1142 | } | ||||
1143 | push @characters,$convchar; | ||||
1144 | } else { | ||||
1145 | my $convchar; | ||||
1146 | if ($chars{$char}) { | ||||
1147 | $convchar= $chars{$char}; | ||||
1148 | # printf "char %x, converted %x\n",$char,$chars{$char}; | ||||
1149 | }else { | ||||
1150 | # printf "char %x $char\n",$char; | ||||
1151 | $convchar=$char; | ||||
1152 | } | ||||
1153 | push @characters,$convchar; | ||||
1154 | } | ||||
1155 | } | ||||
1156 | $result=pack "U*",@characters; | ||||
1157 | # $result=~s/\x01//; | ||||
1158 | # $result=~s/\x00//; | ||||
1159 | $result=~s/\x0f//; | ||||
1160 | $result=~s/\x1b.//; | ||||
1161 | $result=~s/\x0e//; | ||||
1162 | $result=~s/\x1b\x5b//; | ||||
1163 | # map{printf "%x",$_} @characters; | ||||
1164 | # printf "\n"; | ||||
1165 | return $result; | ||||
1166 | } | ||||
1167 | |||||
1168 | 1 | 45µs | 1; | ||
1169 | |||||
1170 | |||||
1171 | =head1 AUTHOR | ||||
1172 | |||||
1173 | Koha Development Team <http://koha-community.org/> | ||||
1174 | |||||
1175 | Galen Charlton <galen.charlton@liblime.com> | ||||
1176 | |||||
1177 | =cut |