Profile of C4/Charset.pm

Filename	/mnt/catalyst/koha/C4/Charset.pm
Statements	Executed 355 statements in 4.46ms

Subroutines
Calls	P	F	Exclusive Time	Inclusive Time	Subroutine
1	1	1	2.32ms	2.99ms	C4::Charset::::BEGIN@24C4::Charset::BEGIN@24
1	1	1	434µs	448µs	C4::Charset::::BEGIN@20C4::Charset::BEGIN@20
1	1	1	10µs	13µs	C4::Charset::::BEGIN@25C4::Charset::BEGIN@25
1	1	1	9µs	38µs	C4::Charset::::BEGIN@23C4::Charset::BEGIN@23
1	1	1	9µs	9µs	C4::Charset::::BEGIN@31C4::Charset::BEGIN@31
1	1	1	9µs	41µs	C4::Charset::::BEGIN@27C4::Charset::BEGIN@27
1	1	1	8µs	72µs	C4::Charset::::BEGIN@26C4::Charset::BEGIN@26
1	1	1	7µs	14µs	C4::Charset::::BEGIN@21C4::Charset::BEGIN@21
1	1	1	6µs	50µs	C4::Charset::::BEGIN@29C4::Charset::BEGIN@29
0	0	0	0s	0s	C4::Charset::::IsStringUTF8ishC4::Charset::IsStringUTF8ish
0	0	0	0s	0s	C4::Charset::::MarcToUTF8RecordC4::Charset::MarcToUTF8Record
0	0	0	0s	0s	C4::Charset::::NormalizeStringC4::Charset::NormalizeString
0	0	0	0s	0s	C4::Charset::::SetMarcUnicodeFlagC4::Charset::SetMarcUnicodeFlag
0	0	0	0s	0s	C4::Charset::::SetUTF8FlagC4::Charset::SetUTF8Flag
0	0	0	0s	0s	C4::Charset::::StripNonXmlCharsC4::Charset::StripNonXmlChars
0	0	0	0s	0s	C4::Charset::::__ANON__[:546]C4::Charset::__ANON__[:546]
0	0	0	0s	0s	C4::Charset::_default_marc21_charconv_to_utf8
0	0	0	0s	0s	C4::Charset::_default_unimarc_charconv_to_utf8
0	0	0	0s	0s	C4::Charset::::_marc_iso5426_to_utf8C4::Charset::_marc_iso5426_to_utf8
0	0	0	0s	0s	C4::Charset::::_marc_marc8_to_utf8C4::Charset::_marc_marc8_to_utf8
0	0	0	0s	0s	C4::Charset::_marc_to_utf8_replacement_char
0	0	0	0s	0s	C4::Charset::_marc_to_utf8_via_text_iconv
0	0	0	0s	0s	C4::Charset::::char_decode5426C4::Charset::char_decode5426
0	0	0	0s	0s	C4::Charset::::nsb_cleanC4::Charset::nsb_clean

Call graph for these subroutines as a Graphviz dot language file.

Line	State ments	Time on line	Calls	Time in subs	Code
1					package C4::Charset;
2
3					# Copyright (C) 2008 LibLime
4					#
5					# This file is part of Koha.
6					#
7					# Koha is free software; you can redistribute it and/or modify it under the
8					# terms of the GNU General Public License as published by the Free Software
9					# Foundation; either version 2 of the License, or (at your option) any later
10					# version.
11					#
12					# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13					# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14					# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15					#
16					# You should have received a copy of the GNU General Public License along
17					# with Koha; if not, write to the Free Software Foundation, Inc.,
18					# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19
20	2	28µs	2	462µs	# spent 448µs (434+14) within C4::Charset::BEGIN@20 which was called: # once (434µs+14µs) by C4::Biblio::BEGIN@37 at line 20 use strict; # spent 448µs making 1 call to C4::Charset::BEGIN@20 # spent 14µs making 1 call to strict::import
21	2	24µs	2	20µs	# spent 14µs (7+6) within C4::Charset::BEGIN@21 which was called: # once (7µs+6µs) by C4::Biblio::BEGIN@37 at line 21 use warnings; # spent 14µs making 1 call to C4::Charset::BEGIN@21 # spent 6µs making 1 call to warnings::import
22
23	2	25µs	2	67µs	# spent 38µs (9+29) within C4::Charset::BEGIN@23 which was called: # once (9µs+29µs) by C4::Biblio::BEGIN@37 at line 23 use MARC::Charset qw/marc8_to_utf8/; # spent 38µs making 1 call to C4::Charset::BEGIN@23 # spent 29µs making 1 call to Exporter::import
24	2	679µs	2	3.00ms	# spent 2.99ms (2.32+670µs) within C4::Charset::BEGIN@24 which was called: # once (2.32ms+670µs) by C4::Biblio::BEGIN@37 at line 24 use Text::Iconv; # spent 2.99ms making 1 call to C4::Charset::BEGIN@24 # spent 15µs making 1 call to Exporter::import
25	2	22µs	2	15µs	# spent 13µs (10+3) within C4::Charset::BEGIN@25 which was called: # once (10µs+3µs) by C4::Biblio::BEGIN@37 at line 25 use C4::Context; # spent 13µs making 1 call to C4::Charset::BEGIN@25 # spent 3µs making 1 call to C4::Context::import
26	2	25µs	2	136µs	# spent 72µs (8+64) within C4::Charset::BEGIN@26 which was called: # once (8µs+64µs) by C4::Biblio::BEGIN@37 at line 26 use C4::Debug; # spent 72µs making 1 call to C4::Charset::BEGIN@26 # spent 64µs making 1 call to Exporter::import
27	2	27µs	2	73µs	# spent 41µs (9+32) within C4::Charset::BEGIN@27 which was called: # once (9µs+32µs) by C4::Biblio::BEGIN@37 at line 27 use Unicode::Normalize; # spent 41µs making 1 call to C4::Charset::BEGIN@27 # spent 32µs making 1 call to Exporter::import
28
29	2	47µs	2	94µs	# spent 50µs (6+44) within C4::Charset::BEGIN@29 which was called: # once (6µs+44µs) by C4::Biblio::BEGIN@37 at line 29 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); # spent 50µs making 1 call to C4::Charset::BEGIN@29 # spent 44µs making 1 call to vars::import
30
31					# spent 9µs within C4::Charset::BEGIN@31 which was called: # once (9µs+0s) by C4::Biblio::BEGIN@37 at line 45 BEGIN {
32					# set the version for version checking
33	1	800ns			$VERSION = 3.07.00.049;
34	1	400ns			require Exporter;
35	1	5µs			@ISA = qw(Exporter);
36	1	4µs			@EXPORT = qw(
37					NormalizeString
38					IsStringUTF8ish
39					MarcToUTF8Record
40					SetUTF8Flag
41					SetMarcUnicodeFlag
42					StripNonXmlChars
43					nsb_clean
44					);
45	1	3.47ms	1	9µs	} # spent 9µs making 1 call to C4::Charset::BEGIN@31
46
47					=head1 NAME
48
49					C4::Charset - utilities for handling character set conversions.
50
51					=head1 SYNOPSIS
52
53					use C4::Charset;
54
55					=head1 DESCRIPTION
56
57					This module contains routines for dealing with character set
58					conversions, particularly for MARC records.
59
60					A variety of character encodings are in use by various MARC
61					standards, and even more character encodings are used by
62					non-standard MARC records. The various MARC formats generally
63					do not do a good job of advertising a given record's character
64					encoding, and even when a record does advertise its encoding,
65					e.g., via the Leader/09, experience has shown that one cannot
66					trust it.
67
68					Ultimately, all MARC records are stored in Koha in UTF-8 and
69					must be converted from whatever the source character encoding is.
70					The goal of this module is to ensure that these conversions
71					take place accurately. When a character conversion cannot take
72					place, or at least not accurately, the module was provide
73					enough information to allow user-facing code to inform the user
74					on how to deal with the situation.
75
76					=cut
77
78					=head1 FUNCTIONS
79
80					=head2 IsStringUTF8ish
81
82					my $is_utf8 = IsStringUTF8ish($str);
83
84					Determines if C<$str> is valid UTF-8. This can mean
85					one of two things:
86
87					=over
88
89					=item *
90
91					The Perl UTF-8 flag is set and the string contains valid UTF-8.
92
93					=item *
94
95					The Perl UTF-8 flag is B<not> set, but the octets contain
96					valid UTF-8.
97
98					=back
99
100					The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
101					because in one could be presented with a MARC blob that is
102					not actually in UTF-8 but whose sequence of octets appears to be
103					valid UTF-8. The rest of the MARC character conversion functions
104					will assume that this situation occur does not very often.
105
106					=cut
107
108					sub IsStringUTF8ish {
109					my $str = shift;
110
111					return 1 if utf8::is_utf8($str);
112					return utf8::decode($str);
113					}
114
115					=head2 SetUTF8Flag
116
117					my $marc_record = SetUTF8Flag($marc_record, $nfd);
118
119					This function sets the PERL UTF8 flag for data.
120					It is required when using new_from_usmarc
121					since MARC::File::USMARC does not handle PERL UTF8 setting.
122					When editing unicode marc records fields and subfields, you
123					would end up in double encoding without using this function.
124
125					If $nfd is set, string normalization will use NFD instead of NFC
126
127					FIXME
128					In my opinion, this function belongs to MARC::Record and not
129					to this package.
130					But since it handles charset, and MARC::Record, it finds its way in that package
131
132					=cut
133
134					sub SetUTF8Flag{
135					my ($record, $nfd)=@_;
136					return unless ($record && $record->fields());
137					foreach my $field ($record->fields()){
138					if ($field->tag()>=10){
139					my @subfields;
140					foreach my $subfield ($field->subfields()){
141					push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
142					}
143					eval {
144					my $newfield=MARC::Field->new(
145					$field->tag(),
146					$field->indicator(1),
147					$field->indicator(2),
148					@subfields
149					);
150					$field->replace_with($newfield);
151					};
152					warn "ERROR occurred in SetUTF8Flag $@" if $@;
153					}
154					}
155					}
156
157					=head2 NormalizeString
158
159					my $normalized_string=NormalizeString($string,$nfd,$transform);
160
161					Given a string
162					nfd : If you want to set NFD and not NFC
163					transform : If you expect all the signs to be removed
164
165					Sets the PERL UTF8 Flag on your initial data if need be
166					and applies cleaning if required
167
168					Returns a utf8 NFC normalized string
169
170					Sample code :
171					my $string=NormalizeString ("l'ornithoptère");
172					#results into ornithoptère in NFC form and sets UTF8 Flag
173
174					=cut
175
176
177					sub NormalizeString{
178					my ($string,$nfd,$transform)=@_;
179					utf8::decode($string) unless (utf8::is_utf8($string));
180					if ($nfd){
181					$string= NFD($string);
182					}
183					else {
184					$string=NFC($string);
185					}
186					if ($transform){
187					$string=~s/\<\|\>\|\^\|\;\|\.\|\?\|,\|\-\|$\|$\|\[\|\]\|\{\|\}\|\$\|\%\|\!\|\*\|\:\|\\\|\/\|\&\|\"\|\'/ /g;
188					#removing one letter words "d'" "l'" was changed into "d " "l "
189					$string=~s/\b\S\b//g;
190					$string=~s/\s+$//g;
191					}
192					return $string;
193					}
194
195					=head2 MarcToUTF8Record
196
197					($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
198					$marc_flavour, [, $source_encoding]);
199
200					Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
201					optional source encoding, return a C<MARC::Record> that is
202					converted to UTF-8.
203
204					The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
205					is not guaranteed to have been converted correctly. Specifically,
206					if C<$converted_from> is 'failed', the MARC record returned failed
207					character conversion and had each of its non-ASCII octets changed
208					to the Unicode replacement character.
209
210					If the source encoding was not specified, this routine will
211					try to guess it; the character encoding used for a successful
212					conversion is returned in C<$converted_from>.
213
214					=cut
215
216					sub MarcToUTF8Record {
217					my $marc = shift;
218					my $marc_flavour = shift;
219					my $source_encoding = shift;
220					my $marc_record;
221					my $marc_blob_is_utf8 = 0;
222					if (ref($marc) eq 'MARC::Record') {
223					my $marc_blob = $marc->as_usmarc();
224					$marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
225					$marc_record = $marc;
226					} else {
227					# dealing with a MARC blob
228
229					# remove any ersatz whitespace from the beginning and
230					# end of the MARC blob -- these can creep into MARC
231					# files produced by several sources -- caller really
232					# should be doing this, however
233					$marc =~ s/^\s+//;
234					$marc =~ s/\s+$//;
235					$marc_blob_is_utf8 = IsStringUTF8ish($marc);
236					eval {
237					$marc_record = MARC::Record->new_from_usmarc($marc);
238					};
239					if ($@) {
240					# if we fail the first time, one likely problem
241					# is that we have a MARC21 record that says that it's
242					# UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
243					# We'll try parsing it again.
244					substr($marc, 9, 1) = ' ';
245					eval {
246					$marc_record = MARC::Record->new_from_usmarc($marc);
247					};
248					if ($@) {
249					# it's hopeless; return an empty MARC::Record
250					return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
251					}
252					}
253					}
254
255					# If we do not know the source encoding, try some guesses
256					# as follows:
257					# 1. Record is UTF-8 already.
258					# 2. If MARC flavor is MARC21 or NORMARC, then
259					# a. record is MARC-8
260					# b. record is ISO-8859-1
261					# 3. If MARC flavor is UNIMARC, then
262					if (not defined $source_encoding) {
263					if ($marc_blob_is_utf8) {
264					# note that for MARC21/NORMARC we are not bothering to check
265					# if the Leader/09 is set to 'a' or not -- because
266					# of problems with various ILSs (including Koha in the
267					# past, alas), this just is not trustworthy.
268					SetMarcUnicodeFlag($marc_record, $marc_flavour);
269					return $marc_record, 'UTF-8', [];
270					} else {
271					if ($marc_flavour eq 'MARC21' \|\| $marc_flavour eq 'NORMARC') {
272					return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
273					} elsif ($marc_flavour =~/UNIMARC/) {
274					return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
275					} else {
276					return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
277					}
278					}
279					} else {
280					# caller knows the character encoding
281					my $original_marc_record = $marc_record->clone();
282					my @errors;
283					if ($source_encoding =~ /utf-?8/i) {
284					if ($marc_blob_is_utf8) {
285					SetMarcUnicodeFlag($marc_record, $marc_flavour);
286					return $marc_record, 'UTF-8', [];
287					} else {
288					push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
289					}
290					} elsif ($source_encoding =~ /marc-?8/i) {
291					@errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
292					} elsif ($source_encoding =~ /5426/) {
293					@errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
294					} else {
295					# assume any other character encoding is for Text::Iconv
296					@errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
297					}
298
299					if (@errors) {
300					_marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
301					return $original_marc_record, 'failed', \@errors;
302					} else {
303					return $marc_record, $source_encoding, [];
304					}
305					}
306
307					}
308
309					=head2 SetMarcUnicodeFlag
310
311					SetMarcUnicodeFlag($marc_record, $marc_flavour);
312
313					Set both the internal MARC::Record encoding flag
314					and the appropriate Leader/09 (MARC21) or
315					100/26-29 (UNIMARC) to indicate that the record
316					is in UTF-8. Note that this does B<not> do
317					any actual character conversion.
318
319					=cut
320
321					sub SetMarcUnicodeFlag {
322					my $marc_record = shift;
323					my $marc_flavour = shift; # \|\| C4::Context->preference("marcflavour");
324
325					$marc_record->encoding('UTF-8');
326					if ($marc_flavour eq 'MARC21' \|\| $marc_flavour eq 'NORMARC') {
327					my $leader = $marc_record->leader();
328					substr($leader, 9, 1) = 'a';
329					$marc_record->leader($leader);
330					} elsif ($marc_flavour =~/UNIMARC/) {
331					my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
332					$defaultlanguage = "fre" if (!$defaultlanguage \|\| length($defaultlanguage) != 3);
333					my $string;
334					my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
335					$string=$marc_record->subfield( 100, "a" );
336					if (defined $string && length($string)==$subflength) {
337					$string = substr $string, 0,$subflength if (length($string)>$subflength);
338					}
339					else {
340					$string = POSIX::strftime( "%Y%m%d", localtime );
341					$string =~ s/\-//g;
342					$string = sprintf( "%-*s", $subflength, $string );
343					substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
344					}
345					substr( $string, $encodingposition, 3, "y50" );
346					if ( $marc_record->subfield( 100, "a" ) ) {
347					$marc_record->field('100')->update(a=>$string);
348					}
349					else {
350					$marc_record->insert_grouped_field(
351					MARC::Field->new( 100, '', '', "a" => $string ) );
352					}
353					$debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
354					} else {
355					warn "Unrecognized marcflavour: $marc_flavour";
356					}
357					}
358
359					=head2 StripNonXmlChars
360
361					my $new_str = StripNonXmlChars($old_str);
362
363					Given a string, return a copy with the
364					characters that are illegal in XML
365					removed.
366
367					This function exists to work around a problem
368					that can occur with badly-encoded MARC records.
369					Specifically, if a UTF-8 MARC record also
370					has excape (\x1b) characters, MARC::File::XML
371					will let the escape characters pass through
372					when as_xml() or as_xml_record() is called. The
373					problem is that the escape character is not
374					legal in well-formed XML documents, so when
375					MARC::File::XML attempts to parse such a record,
376					the XML parser will fail.
377
378					Stripping such characters will allow a
379					MARC::Record->new_from_xml()
380					to work, at the possible risk of some data loss.
381
382					=cut
383
384					sub StripNonXmlChars {
385					my $str = shift;
386					if (!defined($str) \|\| $str eq ""){
387					return "";
388					}
389					$str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
390					return $str;
391					}
392
- -
395					=head2 nsb_clean
396
397					=over 4
398
399					nsb_clean($string);
400
401					=back
402
403					Removes Non Sorting Block characters
404
405					=cut
406					sub nsb_clean {
407					my $NSB = '\x88' ; # NSB : begin Non Sorting Block
408					my $NSE = '\x89' ; # NSE : Non Sorting Block end
409					my $NSB2 = '\x98' ; # NSB : begin Non Sorting Block
410					my $NSE2 = '\x9C' ; # NSE : Non Sorting Block end
411					my $C2 = '\xC2' ; # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
412
413					# handles non sorting blocks
414					my ($string) = @_ ;
415					$_ = $string ;
416					s/$NSB//g ;
417					s/$NSE//g ;
418					s/$NSB2//g ;
419					s/$NSE2//g ;
420					s/$C2//g ;
421					$string = $_ ;
422
423					return($string) ;
424					}
425
426
427					=head1 INTERNAL FUNCTIONS
428
429					=head2 _default_marc21_charconv_to_utf8
430
431					my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
432
433					Converts a C<MARC::Record> of unknown character set to UTF-8,
434					first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
435					to UTF-8, then a default conversion that replaces each non-ASCII
436					character with the replacement character.
437
438					The C<$guessed_charset> return value contains the character set
439					that resulted in a conversion to valid UTF-8; note that
440					if the MARC-8 and ISO-8859-1 conversions failed, the value of
441					this is 'failed'.
442
443					=cut
444
445					sub _default_marc21_charconv_to_utf8 {
446					my $marc_record = shift;
447					my $marc_flavour = shift;
448
449					my $trial_marc8 = $marc_record->clone();
450					my @all_errors = ();
451					my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
452					unless (@errors) {
453					return $trial_marc8, 'MARC-8', [];
454					}
455					push @all_errors, @errors;
456
457					my $trial_8859_1 = $marc_record->clone();
458					@errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
459					unless (@errors) {
460					return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
461					# instead if we wanted to report details
462					# of the failed attempt at MARC-8 => UTF-8
463					}
464					push @all_errors, @errors;
465
466					my $default_converted = $marc_record->clone();
467					_marc_to_utf8_replacement_char($default_converted, $marc_flavour);
468					return $default_converted, 'failed', \@all_errors;
469					}
470
471					=head2 _default_unimarc_charconv_to_utf8
472
473					my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
474
475					Converts a C<MARC::Record> of unknown character set to UTF-8,
476					first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
477					to UTF-8, then a default conversion that replaces each non-ASCII
478					character with the replacement character.
479
480					The C<$guessed_charset> return value contains the character set
481					that resulted in a conversion to valid UTF-8; note that
482					if the MARC-8 and ISO-8859-1 conversions failed, the value of
483					this is 'failed'.
484
485					=cut
486
487					sub _default_unimarc_charconv_to_utf8 {
488					my $marc_record = shift;
489					my $marc_flavour = shift;
490
491					my $trial_marc8 = $marc_record->clone();
492					my @all_errors = ();
493					my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
494					unless (@errors) {
495					return $trial_marc8, 'iso-5426';
496					}
497					push @all_errors, @errors;
498
499					my $trial_8859_1 = $marc_record->clone();
500					@errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
501					unless (@errors) {
502					return $trial_8859_1, 'iso-8859-1';
503					}
504					push @all_errors, @errors;
505
506					my $default_converted = $marc_record->clone();
507					_marc_to_utf8_replacement_char($default_converted, $marc_flavour);
508					return $default_converted, 'failed', \@all_errors;
509					}
510
511					=head2 _marc_marc8_to_utf8
512
513					my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
514
515					Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
516					If the conversion fails for some reason, an
517					appropriate messages will be placed in the returned
518					C<@errors> array.
519
520					=cut
521
522					sub _marc_marc8_to_utf8 {
523					my $marc_record = shift;
524					my $marc_flavour = shift;
525
526					my $prev_ignore = MARC::Charset->ignore_errors();
527					MARC::Charset->ignore_errors(1);
528
529					# trap warnings raised by MARC::Charset
530					my @errors = ();
531					local $SIG{__WARN__} = sub {
532					my $msg = $_[0];
533					if ($msg =~ /MARC.Charset/) {
534					# FIXME - purpose of this regexp is to strip out the
535					# line reference to MARC/Charset.pm, but as it
536					# exists probably won't work quite on Windows --
537					# some sort of minimal-bunch back-tracking RE
538					# would be helpful here
539					$msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
540					push @errors, $msg;
541					} else {
542					# if warning doesn't come from MARC::Charset, just
543					# pass it on
544					warn $msg;
545					}
546					};
547
548					foreach my $field ($marc_record->fields()) {
549					if ($field->is_control_field()) {
550					; # do nothing -- control fields should not contain non-ASCII characters
551					} else {
552					my @converted_subfields;
553					foreach my $subfield ($field->subfields()) {
554					my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
555					unless (IsStringUTF8ish($utf8sf)) {
556					# Because of a bug in MARC::Charset 0.98, if the string
557					# has (a) one or more diacritics that (b) are only in character positions
558					# 128 to 255 inclusive, the resulting converted string is not in
559					# UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
560					# occurs, upgrade the string in place. Moral of the story seems to be
561					# that pack("U", ...) is better than chr(...) if you need to guarantee
562					# that the resulting string is UTF-8.
563					utf8::upgrade($utf8sf);
564					}
565					push @converted_subfields, $subfield->[0], $utf8sf;
566					}
567
568					$field->replace_with(MARC::Field->new(
569					$field->tag(), $field->indicator(1), $field->indicator(2),
570					@converted_subfields)
571					);
572					}
573					}
574
575					MARC::Charset->ignore_errors($prev_ignore);
576
577					SetMarcUnicodeFlag($marc_record, $marc_flavour);
578
579					return @errors;
580					}
581
582					=head2 _marc_iso5426_to_utf8
583
584					my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
585
586					Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
587					If the conversion fails for some reason, an
588					appropriate messages will be placed in the returned
589					C<@errors> array.
590
591					FIXME - is ISO-5426 equivalent enough to MARC-8
592					that C<MARC::Charset> can be used instead?
593
594					=cut
595
596					sub _marc_iso5426_to_utf8 {
597					my $marc_record = shift;
598					my $marc_flavour = shift;
599
600					my @errors = ();
601
602					foreach my $field ($marc_record->fields()) {
603					if ($field->is_control_field()) {
604					; # do nothing -- control fields should not contain non-ASCII characters
605					} else {
606					my @converted_subfields;
607					foreach my $subfield ($field->subfields()) {
608					my $utf8sf = char_decode5426($subfield->[1]);
609					push @converted_subfields, $subfield->[0], $utf8sf;
610					}
611
612					$field->replace_with(MARC::Field->new(
613					$field->tag(), $field->indicator(1), $field->indicator(2),
614					@converted_subfields)
615					);
616					}
617					}
618
619					SetMarcUnicodeFlag($marc_record, $marc_flavour);
620
621					return @errors;
622					}
623
624					=head2 _marc_to_utf8_via_text_iconv
625
626					my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
627
628					Convert a C<MARC::Record> to UTF-8 in-place using the
629					C<Text::Iconv> CPAN module. Any source encoding accepted
630					by the user's iconv installation should work. If
631					the source encoding is not recognized on the user's
632					server or the conversion fails for some reason,
633					appropriate messages will be placed in the returned
634					C<@errors> array.
635
636					=cut
637
638					sub _marc_to_utf8_via_text_iconv {
639					my $marc_record = shift;
640					my $marc_flavour = shift;
641					my $source_encoding = shift;
642
643					my @errors = ();
644					my $decoder;
645					eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
646					if ($@) {
647					push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
648					return @errors;
649					}
650
651					my $prev_raise_error = Text::Iconv->raise_error();
652					Text::Iconv->raise_error(1);
653
654					foreach my $field ($marc_record->fields()) {
655					if ($field->is_control_field()) {
656					; # do nothing -- control fields should not contain non-ASCII characters
657					} else {
658					my @converted_subfields;
659					foreach my $subfield ($field->subfields()) {
660					my $converted_value;
661					my $conversion_ok = 1;
662					eval { $converted_value = $decoder->convert($subfield->[1]); };
663					if ($@) {
664					$conversion_ok = 0;
665					push @errors, $@;
666					} elsif (not defined $converted_value) {
667					$conversion_ok = 0;
668					push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
669					}
670
671					if ($conversion_ok) {
672					push @converted_subfields, $subfield->[0], $converted_value;
673					} else {
674					$converted_value = $subfield->[1];
675					$converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
676					push @converted_subfields, $subfield->[0], $converted_value;
677					}
678					}
679
680					$field->replace_with(MARC::Field->new(
681					$field->tag(), $field->indicator(1), $field->indicator(2),
682					@converted_subfields)
683					);
684					}
685					}
686
687					SetMarcUnicodeFlag($marc_record, $marc_flavour);
688					Text::Iconv->raise_error($prev_raise_error);
689
690					return @errors;
691					}
692
693					=head2 _marc_to_utf8_replacement_char
694
695					_marc_to_utf8_replacement_char($marc_record, $marc_flavour);
696
697					Convert a C<MARC::Record> to UTF-8 in-place, adopting the
698					unsatisfactory method of replacing all non-ASCII (e.g.,
699					where the eight bit is set) octet with the Unicode
700					replacement character. This is meant as a last-ditch
701					method, and would be best used as part of a UI that
702					lets a cataloguer pick various character conversions
703					until he or she finds the right one.
704
705					=cut
706
707					sub _marc_to_utf8_replacement_char {
708					my $marc_record = shift;
709					my $marc_flavour = shift;
710
711					foreach my $field ($marc_record->fields()) {
712					if ($field->is_control_field()) {
713					; # do nothing -- control fields should not contain non-ASCII characters
714					} else {
715					my @converted_subfields;
716					foreach my $subfield ($field->subfields()) {
717					my $value = $subfield->[1];
718					$value =~ s/[\200-\377]/\xef\xbf\xbd/g;
719					push @converted_subfields, $subfield->[0], $value;
720					}
721
722					$field->replace_with(MARC::Field->new(
723					$field->tag(), $field->indicator(1), $field->indicator(2),
724					@converted_subfields)
725					);
726					}
727					}
728
729					SetMarcUnicodeFlag($marc_record, $marc_flavour);
730					}
731
732					=head2 char_decode5426
733
734					my $utf8string = char_decode5426($iso_5426_string);
735
736					Converts a string from ISO-5426 to UTF-8.
737
738					=cut
739
740
741	1	300ns			my %chars;
742	1	1µs			$chars{0xb0}=0x0101;#3/0ayn[ain]
743	1	200ns			$chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
744					#$chars{0xb2}=0x00e0;#'à';
745	1	100ns			$chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
746					#$chars{0xb3}=0x00e7;#'ç';
747	1	100ns			$chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
748					# $chars{0xb4}='è';
749	1	200ns			$chars{0xb4}=0x00e8;
750	1	200ns			$chars{0xbd}=0x02b9;
751	1	200ns			$chars{0xbe}=0x02ba;
752					# $chars{0xb5}='é';
753	1	600ns			$chars{0xb5}=0x00e9;
754	1	200ns			$chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
755	1	100ns			$chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
756	1	200ns			$chars{0xfa}=0x0153; #oe
757	1	100ns			$chars{0xea}=0x0152; #oe
758	1	200ns			$chars{0x81d1}=0x00b0;
759
760					####
761					## combined characters iso5426
762
763	1	200ns			$chars{0xc041}=0x1ea2; # capital a with hook above
764	1	200ns			$chars{0xc045}=0x1eba; # capital e with hook above
765	1	800ns			$chars{0xc049}=0x1ec8; # capital i with hook above
766	1	100ns			$chars{0xc04f}=0x1ece; # capital o with hook above
767	1	200ns			$chars{0xc055}=0x1ee6; # capital u with hook above
768	1	200ns			$chars{0xc059}=0x1ef6; # capital y with hook above
769	1	200ns			$chars{0xc061}=0x1ea3; # small a with hook above
770	1	100ns			$chars{0xc065}=0x1ebb; # small e with hook above
771	1	100ns			$chars{0xc069}=0x1ec9; # small i with hook above
772	1	100ns			$chars{0xc06f}=0x1ecf; # small o with hook above
773	1	100ns			$chars{0xc075}=0x1ee7; # small u with hook above
774	1	200ns			$chars{0xc079}=0x1ef7; # small y with hook above
775
776					# 4/1 grave accent
777	1	200ns			$chars{0xc141}=0x00c0; # capital a with grave accent
778	1	100ns			$chars{0xc145}=0x00c8; # capital e with grave accent
779	1	200ns			$chars{0xc149}=0x00cc; # capital i with grave accent
780	1	100ns			$chars{0xc14f}=0x00d2; # capital o with grave accent
781	1	200ns			$chars{0xc155}=0x00d9; # capital u with grave accent
782	1	100ns			$chars{0xc157}=0x1e80; # capital w with grave
783	1	3µs			$chars{0xc159}=0x1ef2; # capital y with grave
784	1	100ns			$chars{0xc161}=0x00e0; # small a with grave accent
785	1	200ns			$chars{0xc165}=0x00e8; # small e with grave accent
786	1	200ns			$chars{0xc169}=0x00ec; # small i with grave accent
787	1	100ns			$chars{0xc16f}=0x00f2; # small o with grave accent
788	1	100ns			$chars{0xc175}=0x00f9; # small u with grave accent
789	1	100ns			$chars{0xc177}=0x1e81; # small w with grave
790	1	200ns			$chars{0xc179}=0x1ef3; # small y with grave
791					# 4/2 acute accent
792	1	100ns			$chars{0xc241}=0x00c1; # capital a with acute accent
793	1	300ns			$chars{0xc243}=0x0106; # capital c with acute accent
794	1	100ns			$chars{0xc245}=0x00c9; # capital e with acute accent
795	1	200ns			$chars{0xc247}=0x01f4; # capital g with acute
796	1	100ns			$chars{0xc249}=0x00cd; # capital i with acute accent
797	1	100ns			$chars{0xc24b}=0x1e30; # capital k with acute
798	1	100ns			$chars{0xc24c}=0x0139; # capital l with acute accent
799	1	200ns			$chars{0xc24d}=0x1e3e; # capital m with acute
800	1	100ns			$chars{0xc24e}=0x0143; # capital n with acute accent
801	1	100ns			$chars{0xc24f}=0x00d3; # capital o with acute accent
802	1	100ns			$chars{0xc250}=0x1e54; # capital p with acute
803	1	200ns			$chars{0xc252}=0x0154; # capital r with acute accent
804	1	100ns			$chars{0xc253}=0x015a; # capital s with acute accent
805	1	200ns			$chars{0xc255}=0x00da; # capital u with acute accent
806	1	100ns			$chars{0xc257}=0x1e82; # capital w with acute
807	1	100ns			$chars{0xc259}=0x00dd; # capital y with acute accent
808	1	100ns			$chars{0xc25a}=0x0179; # capital z with acute accent
809	1	200ns			$chars{0xc261}=0x00e1; # small a with acute accent
810	1	200ns			$chars{0xc263}=0x0107; # small c with acute accent
811	1	100ns			$chars{0xc265}=0x00e9; # small e with acute accent
812	1	200ns			$chars{0xc267}=0x01f5; # small g with acute
813	1	100ns			$chars{0xc269}=0x00ed; # small i with acute accent
814	1	100ns			$chars{0xc26b}=0x1e31; # small k with acute
815	1	100ns			$chars{0xc26c}=0x013a; # small l with acute accent
816	1	1µs			$chars{0xc26d}=0x1e3f; # small m with acute
817	1	100ns			$chars{0xc26e}=0x0144; # small n with acute accent
818	1	200ns			$chars{0xc26f}=0x00f3; # small o with acute accent
819	1	100ns			$chars{0xc270}=0x1e55; # small p with acute
820	1	100ns			$chars{0xc272}=0x0155; # small r with acute accent
821	1	100ns			$chars{0xc273}=0x015b; # small s with acute accent
822	1	100ns			$chars{0xc275}=0x00fa; # small u with acute accent
823	1	100ns			$chars{0xc277}=0x1e83; # small w with acute
824	1	100ns			$chars{0xc279}=0x00fd; # small y with acute accent
825	1	200ns			$chars{0xc27a}=0x017a; # small z with acute accent
826	1	100ns			$chars{0xc2e1}=0x01fc; # capital ae with acute
827	1	200ns			$chars{0xc2f1}=0x01fd; # small ae with acute
828					# 4/3 circumflex accent
829	1	200ns			$chars{0xc341}=0x00c2; # capital a with circumflex accent
830	1	200ns			$chars{0xc343}=0x0108; # capital c with circumflex
831	1	200ns			$chars{0xc345}=0x00ca; # capital e with circumflex accent
832	1	100ns			$chars{0xc347}=0x011c; # capital g with circumflex
833	1	200ns			$chars{0xc348}=0x0124; # capital h with circumflex
834	1	100ns			$chars{0xc349}=0x00ce; # capital i with circumflex accent
835	1	200ns			$chars{0xc34a}=0x0134; # capital j with circumflex
836	1	100ns			$chars{0xc34f}=0x00d4; # capital o with circumflex accent
837	1	200ns			$chars{0xc353}=0x015c; # capital s with circumflex
838	1	100ns			$chars{0xc355}=0x00db; # capital u with circumflex
839	1	200ns			$chars{0xc357}=0x0174; # capital w with circumflex
840	1	100ns			$chars{0xc359}=0x0176; # capital y with circumflex
841	1	100ns			$chars{0xc35a}=0x1e90; # capital z with circumflex
842	1	100ns			$chars{0xc361}=0x00e2; # small a with circumflex accent
843	1	200ns			$chars{0xc363}=0x0109; # small c with circumflex
844	1	100ns			$chars{0xc365}=0x00ea; # small e with circumflex accent
845	1	200ns			$chars{0xc367}=0x011d; # small g with circumflex
846	1	100ns			$chars{0xc368}=0x0125; # small h with circumflex
847	1	200ns			$chars{0xc369}=0x00ee; # small i with circumflex accent
848	1	100ns			$chars{0xc36a}=0x0135; # small j with circumflex
849	1	100ns			$chars{0xc36e}=0x00f1; # small n with tilde
850	1	200ns			$chars{0xc36f}=0x00f4; # small o with circumflex accent
851	1	200ns			$chars{0xc373}=0x015d; # small s with circumflex
852	1	100ns			$chars{0xc375}=0x00fb; # small u with circumflex
853	1	200ns			$chars{0xc377}=0x0175; # small w with circumflex
854	1	200ns			$chars{0xc379}=0x0177; # small y with circumflex
855	1	200ns			$chars{0xc37a}=0x1e91; # small z with circumflex
856					# 4/4 tilde
857	1	200ns			$chars{0xc441}=0x00c3; # capital a with tilde
858	1	100ns			$chars{0xc445}=0x1ebc; # capital e with tilde
859	1	200ns			$chars{0xc449}=0x0128; # capital i with tilde
860	1	100ns			$chars{0xc44e}=0x00d1; # capital n with tilde
861	1	200ns			$chars{0xc44f}=0x00d5; # capital o with tilde
862	1	100ns			$chars{0xc455}=0x0168; # capital u with tilde
863	1	200ns			$chars{0xc456}=0x1e7c; # capital v with tilde
864	1	100ns			$chars{0xc459}=0x1ef8; # capital y with tilde
865	1	200ns			$chars{0xc461}=0x00e3; # small a with tilde
866	1	100ns			$chars{0xc465}=0x1ebd; # small e with tilde
867	1	200ns			$chars{0xc469}=0x0129; # small i with tilde
868	1	200ns			$chars{0xc46e}=0x00f1; # small n with tilde
869	1	100ns			$chars{0xc46f}=0x00f5; # small o with tilde
870	1	200ns			$chars{0xc475}=0x0169; # small u with tilde
871	1	200ns			$chars{0xc476}=0x1e7d; # small v with tilde
872	1	100ns			$chars{0xc479}=0x1ef9; # small y with tilde
873					# 4/5 macron
874	1	200ns			$chars{0xc541}=0x0100; # capital a with macron
875	1	200ns			$chars{0xc545}=0x0112; # capital e with macron
876	1	100ns			$chars{0xc547}=0x1e20; # capital g with macron
877	1	200ns			$chars{0xc549}=0x012a; # capital i with macron
878	1	200ns			$chars{0xc54f}=0x014c; # capital o with macron
879	1	100ns			$chars{0xc555}=0x016a; # capital u with macron
880	1	200ns			$chars{0xc561}=0x0101; # small a with macron
881	1	500ns			$chars{0xc565}=0x0113; # small e with macron
882	1	200ns			$chars{0xc567}=0x1e21; # small g with macron
883	1	2µs			$chars{0xc569}=0x012b; # small i with macron
884	1	200ns			$chars{0xc56f}=0x014d; # small o with macron
885	1	200ns			$chars{0xc575}=0x016b; # small u with macron
886	1	200ns			$chars{0xc572}=0x0159; # small r with macron
887	1	100ns			$chars{0xc5e1}=0x01e2; # capital ae with macron
888	1	200ns			$chars{0xc5f1}=0x01e3; # small ae with macron
889					# 4/6 breve
890	1	100ns			$chars{0xc641}=0x0102; # capital a with breve
891	1	200ns			$chars{0xc645}=0x0114; # capital e with breve
892	1	100ns			$chars{0xc647}=0x011e; # capital g with breve
893	1	200ns			$chars{0xc649}=0x012c; # capital i with breve
894	1	100ns			$chars{0xc64f}=0x014e; # capital o with breve
895	1	200ns			$chars{0xc655}=0x016c; # capital u with breve
896	1	100ns			$chars{0xc661}=0x0103; # small a with breve
897	1	200ns			$chars{0xc665}=0x0115; # small e with breve
898	1	100ns			$chars{0xc667}=0x011f; # small g with breve
899	1	100ns			$chars{0xc669}=0x012d; # small i with breve
900	1	100ns			$chars{0xc66f}=0x014f; # small o with breve
901	1	200ns			$chars{0xc675}=0x016d; # small u with breve
902					# 4/7 dot above
903	1	100ns			$chars{0xc7b0}=0x01e1; # Ain with dot above
904	1	200ns			$chars{0xc742}=0x1e02; # capital b with dot above
905	1	200ns			$chars{0xc743}=0x010a; # capital c with dot above
906	1	100ns			$chars{0xc744}=0x1e0a; # capital d with dot above
907	1	200ns			$chars{0xc745}=0x0116; # capital e with dot above
908	1	100ns			$chars{0xc746}=0x1e1e; # capital f with dot above
909	1	200ns			$chars{0xc747}=0x0120; # capital g with dot above
910	1	200ns			$chars{0xc748}=0x1e22; # capital h with dot above
911	1	100ns			$chars{0xc749}=0x0130; # capital i with dot above
912	1	200ns			$chars{0xc74d}=0x1e40; # capital m with dot above
913	1	200ns			$chars{0xc74e}=0x1e44; # capital n with dot above
914	1	100ns			$chars{0xc750}=0x1e56; # capital p with dot above
915	1	200ns			$chars{0xc752}=0x1e58; # capital r with dot above
916	1	100ns			$chars{0xc753}=0x1e60; # capital s with dot above
917	1	100ns			$chars{0xc754}=0x1e6a; # capital t with dot above
918	1	200ns			$chars{0xc757}=0x1e86; # capital w with dot above
919	1	100ns			$chars{0xc758}=0x1e8a; # capital x with dot above
920	1	200ns			$chars{0xc759}=0x1e8e; # capital y with dot above
921	1	200ns			$chars{0xc75a}=0x017b; # capital z with dot above
922	1	200ns			$chars{0xc761}=0x0227; # small b with dot above
923	1	100ns			$chars{0xc762}=0x1e03; # small b with dot above
924	1	200ns			$chars{0xc763}=0x010b; # small c with dot above
925	1	100ns			$chars{0xc764}=0x1e0b; # small d with dot above
926	1	100ns			$chars{0xc765}=0x0117; # small e with dot above
927	1	100ns			$chars{0xc766}=0x1e1f; # small f with dot above
928	1	200ns			$chars{0xc767}=0x0121; # small g with dot above
929	1	100ns			$chars{0xc768}=0x1e23; # small h with dot above
930	1	100ns			$chars{0xc76d}=0x1e41; # small m with dot above
931	1	100ns			$chars{0xc76e}=0x1e45; # small n with dot above
932	1	100ns			$chars{0xc770}=0x1e57; # small p with dot above
933	1	100ns			$chars{0xc772}=0x1e59; # small r with dot above
934	1	200ns			$chars{0xc773}=0x1e61; # small s with dot above
935	1	100ns			$chars{0xc774}=0x1e6b; # small t with dot above
936	1	200ns			$chars{0xc777}=0x1e87; # small w with dot above
937	1	100ns			$chars{0xc778}=0x1e8b; # small x with dot above
938	1	100ns			$chars{0xc779}=0x1e8f; # small y with dot above
939	1	200ns			$chars{0xc77a}=0x017c; # small z with dot above
940					# 4/8 trema, diaresis
941	1	100ns			$chars{0xc820}=0x00a8; # diaeresis
942	1	200ns			$chars{0xc841}=0x00c4; # capital a with diaeresis
943	1	200ns			$chars{0xc845}=0x00cb; # capital e with diaeresis
944	1	100ns			$chars{0xc848}=0x1e26; # capital h with diaeresis
945	1	200ns			$chars{0xc849}=0x00cf; # capital i with diaeresis
946	1	100ns			$chars{0xc84f}=0x00d6; # capital o with diaeresis
947	1	600ns			$chars{0xc855}=0x00dc; # capital u with diaeresis
948	1	200ns			$chars{0xc857}=0x1e84; # capital w with diaeresis
949	1	100ns			$chars{0xc858}=0x1e8c; # capital x with diaeresis
950	1	100ns			$chars{0xc859}=0x0178; # capital y with diaeresis
951	1	200ns			$chars{0xc861}=0x00e4; # small a with diaeresis
952	1	100ns			$chars{0xc865}=0x00eb; # small e with diaeresis
953	1	200ns			$chars{0xc868}=0x1e27; # small h with diaeresis
954	1	200ns			$chars{0xc869}=0x00ef; # small i with diaeresis
955	1	100ns			$chars{0xc86f}=0x00f6; # small o with diaeresis
956	1	100ns			$chars{0xc874}=0x1e97; # small t with diaeresis
957	1	200ns			$chars{0xc875}=0x00fc; # small u with diaeresis
958	1	200ns			$chars{0xc877}=0x1e85; # small w with diaeresis
959	1	200ns			$chars{0xc878}=0x1e8d; # small x with diaeresis
960	1	2µs			$chars{0xc879}=0x00ff; # small y with diaeresis
961					# 4/9 umlaut
962	1	200ns			$chars{0xc920}=0x00a8; # [diaeresis]
963	1	100ns			$chars{0xc961}=0x00e4; # a with umlaut
964	1	200ns			$chars{0xc965}=0x00eb; # e with umlaut
965	1	200ns			$chars{0xc969}=0x00ef; # i with umlaut
966	1	100ns			$chars{0xc96f}=0x00f6; # o with umlaut
967	1	100ns			$chars{0xc975}=0x00fc; # u with umlaut
968					# 4/10 circle above
969	1	100ns			$chars{0xca41}=0x00c5; # capital a with ring above
970	1	200ns			$chars{0xcaad}=0x016e; # capital u with ring above
971	1	100ns			$chars{0xca61}=0x00e5; # small a with ring above
972	1	100ns			$chars{0xca75}=0x016f; # small u with ring above
973	1	100ns			$chars{0xca77}=0x1e98; # small w with ring above
974	1	100ns			$chars{0xca79}=0x1e99; # small y with ring above
975					# 4/11 high comma off centre
976					# 4/12 inverted high comma centred
977					# 4/13 double acute accent
978	1	100ns			$chars{0xcd4f}=0x0150; # capital o with double acute
979	1	200ns			$chars{0xcd55}=0x0170; # capital u with double acute
980	1	200ns			$chars{0xcd6f}=0x0151; # small o with double acute
981	1	100ns			$chars{0xcd75}=0x0171; # small u with double acute
982					# 4/14 horn
983	1	100ns			$chars{0xce54}=0x01a0; # latin capital letter o with horn
984	1	100ns			$chars{0xce55}=0x01af; # latin capital letter u with horn
985	1	100ns			$chars{0xce74}=0x01a1; # latin small letter o with horn
986	1	200ns			$chars{0xce75}=0x01b0; # latin small letter u with horn
987					# 4/15 caron (hacek
988	1	100ns			$chars{0xcf41}=0x01cd; # capital a with caron
989	1	200ns			$chars{0xcf43}=0x010c; # capital c with caron
990	1	100ns			$chars{0xcf44}=0x010e; # capital d with caron
991	1	200ns			$chars{0xcf45}=0x011a; # capital e with caron
992	1	100ns			$chars{0xcf47}=0x01e6; # capital g with caron
993	1	200ns			$chars{0xcf49}=0x01cf; # capital i with caron
994	1	100ns			$chars{0xcf4b}=0x01e8; # capital k with caron
995	1	200ns			$chars{0xcf4c}=0x013d; # capital l with caron
996	1	100ns			$chars{0xcf4e}=0x0147; # capital n with caron
997	1	200ns			$chars{0xcf4f}=0x01d1; # capital o with caron
998	1	100ns			$chars{0xcf52}=0x0158; # capital r with caron
999	1	200ns			$chars{0xcf53}=0x0160; # capital s with caron
1000	1	100ns			$chars{0xcf54}=0x0164; # capital t with caron
1001	1	200ns			$chars{0xcf55}=0x01d3; # capital u with caron
1002	1	100ns			$chars{0xcf5a}=0x017d; # capital z with caron
1003	1	200ns			$chars{0xcf61}=0x01ce; # small a with caron
1004	1	100ns			$chars{0xcf63}=0x010d; # small c with caron
1005	1	200ns			$chars{0xcf64}=0x010f; # small d with caron
1006	1	100ns			$chars{0xcf65}=0x011b; # small e with caron
1007	1	100ns			$chars{0xcf67}=0x01e7; # small g with caron
1008	1	100ns			$chars{0xcf69}=0x01d0; # small i with caron
1009	1	100ns			$chars{0xcf6a}=0x01f0; # small j with caron
1010	1	100ns			$chars{0xcf6b}=0x01e9; # small k with caron
1011	1	100ns			$chars{0xcf6c}=0x013e; # small l with caron
1012	1	100ns			$chars{0xcf6e}=0x0148; # small n with caron
1013	1	200ns			$chars{0xcf6f}=0x01d2; # small o with caron
1014	1	100ns			$chars{0xcf72}=0x0159; # small r with caron
1015	1	100ns			$chars{0xcf73}=0x0161; # small s with caron
1016	1	200ns			$chars{0xcf74}=0x0165; # small t with caron
1017	1	100ns			$chars{0xcf75}=0x01d4; # small u with caron
1018	1	200ns			$chars{0xcf7a}=0x017e; # small z with caron
1019					# 5/0 cedilla
1020	1	200ns			$chars{0xd020}=0x00b8; # cedilla
1021	1	100ns			$chars{0xd043}=0x00c7; # capital c with cedilla
1022	1	5µs			$chars{0xd044}=0x1e10; # capital d with cedilla
1023	1	200ns			$chars{0xd047}=0x0122; # capital g with cedilla
1024	1	200ns			$chars{0xd048}=0x1e28; # capital h with cedilla
1025	1	100ns			$chars{0xd04b}=0x0136; # capital k with cedilla
1026	1	200ns			$chars{0xd04c}=0x013b; # capital l with cedilla
1027	1	100ns			$chars{0xd04e}=0x0145; # capital n with cedilla
1028	1	200ns			$chars{0xd052}=0x0156; # capital r with cedilla
1029	1	200ns			$chars{0xd053}=0x015e; # capital s with cedilla
1030	1	100ns			$chars{0xd054}=0x0162; # capital t with cedilla
1031	1	200ns			$chars{0xd063}=0x00e7; # small c with cedilla
1032	1	200ns			$chars{0xd064}=0x1e11; # small d with cedilla
1033	1	100ns			$chars{0xd065}=0x0119; # small e with cedilla
1034	1	200ns			$chars{0xd067}=0x0123; # small g with cedilla
1035	1	100ns			$chars{0xd068}=0x1e29; # small h with cedilla
1036	1	200ns			$chars{0xd06b}=0x0137; # small k with cedilla
1037	1	200ns			$chars{0xd06c}=0x013c; # small l with cedilla
1038	1	200ns			$chars{0xd06e}=0x0146; # small n with cedilla
1039	1	100ns			$chars{0xd072}=0x0157; # small r with cedilla
1040	1	100ns			$chars{0xd073}=0x015f; # small s with cedilla
1041	1	100ns			$chars{0xd074}=0x0163; # small t with cedilla
1042					# 5/1 rude
1043					# 5/2 hook to left
1044					# 5/3 ogonek (hook to right
1045	1	100ns			$chars{0xd320}=0x02db; # ogonek
1046	1	100ns			$chars{0xd341}=0x0104; # capital a with ogonek
1047	1	100ns			$chars{0xd345}=0x0118; # capital e with ogonek
1048	1	200ns			$chars{0xd349}=0x012e; # capital i with ogonek
1049	1	100ns			$chars{0xd34f}=0x01ea; # capital o with ogonek
1050	1	200ns			$chars{0xd355}=0x0172; # capital u with ogonek
1051	1	200ns			$chars{0xd361}=0x0105; # small a with ogonek
1052	1	100ns			$chars{0xd365}=0x0119; # small e with ogonek
1053	1	200ns			$chars{0xd369}=0x012f; # small i with ogonek
1054	1	100ns			$chars{0xd36f}=0x01eb; # small o with ogonek
1055	1	200ns			$chars{0xd375}=0x0173; # small u with ogonek
1056					# 5/4 circle below
1057	1	100ns			$chars{0xd441}=0x1e00; # capital a with ring below
1058	1	200ns			$chars{0xd461}=0x1e01; # small a with ring below
1059					# 5/5 half circle below
1060	1	100ns			$chars{0xf948}=0x1e2a; # capital h with breve below
1061	1	100ns			$chars{0xf968}=0x1e2b; # small h with breve below
1062					# 5/6 dot below
1063	1	100ns			$chars{0xd641}=0x1ea0; # capital a with dot below
1064	1	200ns			$chars{0xd642}=0x1e04; # capital b with dot below
1065	1	100ns			$chars{0xd644}=0x1e0c; # capital d with dot below
1066	1	200ns			$chars{0xd645}=0x1eb8; # capital e with dot below
1067	1	100ns			$chars{0xd648}=0x1e24; # capital h with dot below
1068	1	100ns			$chars{0xd649}=0x1eca; # capital i with dot below
1069	1	200ns			$chars{0xd64b}=0x1e32; # capital k with dot below
1070	1	100ns			$chars{0xd64c}=0x1e36; # capital l with dot below
1071	1	200ns			$chars{0xd64d}=0x1e42; # capital m with dot below
1072	1	100ns			$chars{0xd64e}=0x1e46; # capital n with dot below
1073	1	200ns			$chars{0xd64f}=0x1ecc; # capital o with dot below
1074	1	100ns			$chars{0xd652}=0x1e5a; # capital r with dot below
1075	1	200ns			$chars{0xd653}=0x1e62; # capital s with dot below
1076	1	100ns			$chars{0xd654}=0x1e6c; # capital t with dot below
1077	1	200ns			$chars{0xd655}=0x1ee4; # capital u with dot below
1078	1	100ns			$chars{0xd656}=0x1e7e; # capital v with dot below
1079	1	100ns			$chars{0xd657}=0x1e88; # capital w with dot below
1080	1	200ns			$chars{0xd659}=0x1ef4; # capital y with dot below
1081	1	100ns			$chars{0xd65a}=0x1e92; # capital z with dot below
1082	1	100ns			$chars{0xd661}=0x1ea1; # small a with dot below
1083	1	100ns			$chars{0xd662}=0x1e05; # small b with dot below
1084	1	100ns			$chars{0xd664}=0x1e0d; # small d with dot below
1085	1	200ns			$chars{0xd665}=0x1eb9; # small e with dot below
1086	1	100ns			$chars{0xd668}=0x1e25; # small h with dot below
1087	1	100ns			$chars{0xd669}=0x1ecb; # small i with dot below
1088	1	100ns			$chars{0xd66b}=0x1e33; # small k with dot below
1089	1	100ns			$chars{0xd66c}=0x1e37; # small l with dot below
1090	1	100ns			$chars{0xd66d}=0x1e43; # small m with dot below
1091	1	200ns			$chars{0xd66e}=0x1e47; # small n with dot below
1092	1	100ns			$chars{0xd66f}=0x1ecd; # small o with dot below
1093	1	100ns			$chars{0xd672}=0x1e5b; # small r with dot below
1094	1	100ns			$chars{0xd673}=0x1e63; # small s with dot below
1095	1	200ns			$chars{0xd674}=0x1e6d; # small t with dot below
1096	1	100ns			$chars{0xd675}=0x1ee5; # small u with dot below
1097	1	100ns			$chars{0xd676}=0x1e7f; # small v with dot below
1098	1	100ns			$chars{0xd677}=0x1e89; # small w with dot below
1099	1	200ns			$chars{0xd679}=0x1ef5; # small y with dot below
1100	1	100ns			$chars{0xd67a}=0x1e93; # small z with dot below
1101					# 5/7 double dot below
1102	1	200ns			$chars{0xd755}=0x1e72; # capital u with diaeresis below
1103	1	100ns			$chars{0xd775}=0x1e73; # small u with diaeresis below
1104					# 5/8 underline
1105	1	200ns			$chars{0xd820}=0x005f; # underline
1106					# 5/9 double underline
1107	1	100ns			$chars{0xd920}=0x2017; # double underline
1108					# 5/10 small low vertical bar
1109	1	200ns			$chars{0xda20}=0x02cc; #
1110					# 5/11 circumflex below
1111					# 5/12 (this position shall not be used)
1112					# 5/13 left half of ligature sign and of double tilde
1113					# 5/14 right half of ligature sign
1114					# 5/15 right half of double tilde
1115					# map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1116
1117					sub char_decode5426 {
1118					my ( $string) = @_;
1119					my $result;
1120
1121					my @data = unpack("C*", $string);
1122					my @characters;
1123					my $length=scalar(@data);
1124					for (my $i = 0; $i < scalar(@data); $i++) {
1125					my $char= $data[$i];
1126					if ($char >= 0x00 && $char <= 0x7F){
1127					#IsAscii
1128
1129					push @characters,$char unless ($char<0x02 \|\|$char== 0x0F);
1130					}elsif (($char >= 0xC0 && $char <= 0xDF)) {
1131					#Combined Char
1132					my $convchar ;
1133					if ($chars{$char*256+$data[$i+1]}) {
1134					$convchar= $chars{$char * 256 + $data[$i+1]};
1135					$i++;
1136					# printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1137					} elsif ($chars{$char}) {
1138					$convchar= $chars{$char};
1139					# printf "0xC char %x, converted %x\n",$char,$chars{$char};
1140					}else {
1141					$convchar=$char;
1142					}
1143					push @characters,$convchar;
1144					} else {
1145					my $convchar;
1146					if ($chars{$char}) {
1147					$convchar= $chars{$char};
1148					# printf "char %x, converted %x\n",$char,$chars{$char};
1149					}else {
1150					# printf "char %x $char\n",$char;
1151					$convchar=$char;
1152					}
1153					push @characters,$convchar;
1154					}
1155					}
1156					$result=pack "U*",@characters;
1157					# $result=~s/\x01//;
1158					# $result=~s/\x00//;
1159					$result=~s/\x0f//;
1160					$result=~s/\x1b.//;
1161					$result=~s/\x0e//;
1162					$result=~s/\x1b\x5b//;
1163					# map{printf "%x",$_} @characters;
1164					# printf "\n";
1165					return $result;
1166					}
1167
1168	1	45µs			1;
1169
1170
1171					=head1 AUTHOR
1172
1173					Koha Development Team <http://koha-community.org/>
1174
1175					Galen Charlton <galen.charlton@liblime.com>
1176
1177					=cut