← Index
NYTProf Performance Profile   « block view • line view • sub view »
For /usr/share/koha/opac/cgi-bin/opac/opac-search.pl
  Run on Tue Oct 15 11:58:52 2013
Reported on Tue Oct 15 12:01:06 2013

Filename/usr/share/perl5/MARC/Charset.pm
StatementsExecuted 34 statements in 2.29ms
Subroutines
Calls P F Exclusive
Time
Inclusive
Time
Subroutine
1111.73ms3.79msMARC::Charset::::BEGIN@12MARC::Charset::BEGIN@12
1111.02ms2.11msMARC::Charset::::BEGIN@10MARC::Charset::BEGIN@10
111726µs9.88msMARC::Charset::::BEGIN@13MARC::Charset::BEGIN@13
11123µs36µsMARC::Charset::::BEGIN@5MARC::Charset::BEGIN@5
11118µs224µsMARC::Charset::::BEGIN@11MARC::Charset::BEGIN@11
11117µs22µsMARC::Charset::::BEGIN@4MARC::Charset::BEGIN@4
11111µs277µsMARC::Charset::::BEGIN@14MARC::Charset::BEGIN@14
11110µs64µsMARC::Charset::::BEGIN@7MARC::Charset::BEGIN@7
0000s0sMARC::Charset::::_process_escapeMARC::Charset::_process_escape
0000s0sMARC::Charset::::assume_encodingMARC::Charset::assume_encoding
0000s0sMARC::Charset::::assume_unicodeMARC::Charset::assume_unicode
0000s0sMARC::Charset::::ignore_errorsMARC::Charset::ignore_errors
0000s0sMARC::Charset::::marc8_to_utf8MARC::Charset::marc8_to_utf8
0000s0sMARC::Charset::::reset_charsetsMARC::Charset::reset_charsets
0000s0sMARC::Charset::::utf8_to_marc8MARC::Charset::utf8_to_marc8
Call graph for these subroutines as a Graphviz dot language file.
Line State
ments
Time
on line
Calls Time
in subs
Code
1package MARC::Charset;
2
31600nsour $VERSION = '1.2';
4328µs226µs
# spent 22µs (17+4) within MARC::Charset::BEGIN@4 which was called: # once (17µs+4µs) by MARC::File::SAX::BEGIN@14 at line 4
use strict;
# spent 22µs making 1 call to MARC::Charset::BEGIN@4 # spent 4µs making 1 call to strict::import
5331µs248µs
# spent 36µs (23+12) within MARC::Charset::BEGIN@5 which was called: # once (23µs+12µs) by MARC::File::SAX::BEGIN@14 at line 5
use warnings;
# spent 36µs making 1 call to MARC::Charset::BEGIN@5 # spent 12µs making 1 call to warnings::import
6
7347µs2118µs
# spent 64µs (10+54) within MARC::Charset::BEGIN@7 which was called: # once (10µs+54µs) by MARC::File::SAX::BEGIN@14 at line 7
use base qw(Exporter);
# spent 64µs making 1 call to MARC::Charset::BEGIN@7 # spent 54µs making 1 call to base::import
811µsour @EXPORT_OK = qw(marc8_to_utf8 utf8_to_marc8);
9
103356µs22.17ms
# spent 2.11ms (1.02+1.10) within MARC::Charset::BEGIN@10 which was called: # once (1.02ms+1.10ms) by MARC::File::SAX::BEGIN@14 at line 10
use Unicode::Normalize;
# spent 2.11ms making 1 call to MARC::Charset::BEGIN@10 # spent 58µs making 1 call to Exporter::import
11344µs2430µs
# spent 224µs (18+206) within MARC::Charset::BEGIN@11 which was called: # once (18µs+206µs) by MARC::File::SAX::BEGIN@14 at line 11
use Encode 'decode';
# spent 224µs making 1 call to MARC::Charset::BEGIN@11 # spent 206µs making 1 call to Exporter::import
123143µs25.73ms
# spent 3.79ms (1.73+2.06) within MARC::Charset::BEGIN@12 which was called: # once (1.73ms+2.06ms) by MARC::File::SAX::BEGIN@14 at line 12
use charnames ':full';
# spent 3.79ms making 1 call to MARC::Charset::BEGIN@12 # spent 1.94ms making 1 call to charnames::import
133155µs19.88ms
# spent 9.88ms (726µs+9.15) within MARC::Charset::BEGIN@13 which was called: # once (726µs+9.15ms) by MARC::File::SAX::BEGIN@14 at line 13
use MARC::Charset::Table;
# spent 9.88ms making 1 call to MARC::Charset::BEGIN@13
1431.47ms2543µs
# spent 277µs (11+266) within MARC::Charset::BEGIN@14 which was called: # once (11µs+266µs) by MARC::File::SAX::BEGIN@14 at line 14
use MARC::Charset::Constants qw(:all);
# spent 277µs making 1 call to MARC::Charset::BEGIN@14 # spent 266µs making 1 call to Exporter::import
15
16=head1 NAME
17
- -
43# get the mapping table
4413µs1184µsour $table = MARC::Charset::Table->new();
# spent 184µs making 1 call to MARC::Charset::Table::new
45
46# set default character sets
47# these are viewable at the package level
48# in case someone wants to set them
491600nsour $DEFAULT_G0 = ASCII_DEFAULT;
501400nsour $DEFAULT_G1 = EXTENDED_LATIN;
51
52=head2 ignore_errors()
53
- -
661400nsour $_ignore_errors = 0;
67sub ignore_errors {
68 my ($self,$i) = @_;
69 $_ignore_errors = $i if (defined($i));
70 return $_ignore_errors;
71}
72
73
74=head2 assume_unicode()
75
- -
891300nsour $_assume = '';
90sub assume_unicode {
91 my ($self,$i) = @_;
92 $_assume = 'utf8' if (defined($i) and $i);
93 return 1 if ($_assume eq 'utf8');
94}
95
96
97=head2 assume_encoding()
98
- -
111sub assume_encoding {
112 my ($self,$i) = @_;
113 $_assume = $i if (defined($i));
114 return $_assume;
115}
116
117
118# place holders for working graphical character sets
1191300nsmy $G0;
1201200nsmy $G1;
121
122=head2 marc8_to_utf8()
123
- -
142sub marc8_to_utf8
143{
144 my ($marc8, $ignore_errors) = @_;
145 reset_charsets();
146
147 $ignore_errors = $_ignore_errors if (!defined($ignore_errors));
148
149 # holder for our utf8
150 my $utf8 = '';
151
152 my $index = 0;
153 my $length = length($marc8);
154 my $combining = '';
155 CHAR_LOOP: while ($index < $length)
156 {
157 # whitespace, line feeds and carriage returns just get added on unmolested
158 if (substr($marc8, $index, 1) =~ m/(\s+|\x0A+|\x0D+)/so)
159 {
160 $utf8 .= $1;
161 $index += 1;
162 next CHAR_LOOP;
163 }
164
165 # look for any escape sequences
166 my $new_index = _process_escape(\$marc8, $index, $length);
167 if ($new_index > $index)
168 {
169 $index = $new_index;
170 next CHAR_LOOP;
171 }
172
173 my $found;
174 CHARSET_LOOP: foreach my $charset ($G0, $G1)
175 {
176
177 # cjk characters are a string of three chars
178 my $char_size = $charset eq CJK ? 3 : 1;
179
180 # extract the next code point to examine
181 my $chunk = substr($marc8, $index, $char_size);
182
183 # look up the character to see if it's in our mapping
184 my $code = $table->lookup_by_marc8($charset, $chunk);
185
186 # try the next character set if no mapping was found
187 next CHARSET_LOOP if ! $code;
188 $found = 1;
189
190 # gobble up all combining characters for appending later
191 # this is necessary because combinging characters precede
192 # the character they modifiy in MARC-8, whereas they follow
193 # the character they modify in UTF-8.
194 if ($code->is_combining())
195 {
196 $combining .= $code->char_value();
197 }
198 else
199 {
200 $utf8 .= $code->char_value() . $combining;
201 $combining = '';
202 }
203
204 $index += $char_size;
205 next CHAR_LOOP;
206 }
207
208 if (!$found)
209 {
210 warn(sprintf("no mapping found for [0x\%X] at position $index in $marc8 ".
211 "g0=".MARC::Charset::Constants::charset_name($G0) . " " .
212 "g1=".MARC::Charset::Constants::charset_name($G1), unpack('C',substr($marc8,$index,1))));
213 if (!$ignore_errors)
214 {
215 reset_charsets();
216 return;
217 }
218 if ($_assume)
219 {
220 reset_charsets();
221 return NFC(decode($_assume => $marc8));
222 }
223 $index += 1;
224 }
225
226 }
227
228 # return the utf8
229 reset_charsets();
230 return $utf8;
231}
232
- -
235=head2 utf8_to_marc8()
236
- -
254sub utf8_to_marc8
255{
256 my ($utf8, $ignore_errors) = @_;
257 reset_charsets();
258
259 $ignore_errors = $_ignore_errors if (!defined($ignore_errors));
260
261 # decompose combined characters
262 $utf8 = NFD($utf8);
263
264 my $len = length($utf8);
265 my $marc8 = '';
266 for (my $i=0; $i<$len; $i++)
267 {
268 my $slice = substr($utf8, $i, 1);
269
270 # spaces are copied from utf8 into marc8
271 if ($slice eq ' ')
272 {
273 $marc8 .= ' ';
274 next;
275 }
276
277 # try to find the code point in our mapping table
278 my $code = $table->lookup_by_utf8($slice);
279
280 if (! $code)
281 {
282 warn("no mapping found at position $i in $utf8");
283 reset_charsets() and return unless $ignore_errors;
284 }
285
286 # if it's a combining character move it around
287 if ($code->is_combining())
288 {
289 my $prev = chop($marc8);
290 $marc8 .= $code->marc_value() . $prev;
291 next;
292 }
293
294 # look to see if we need to escape to a new G0 charset
295 my $charset_value = $code->charset_value();
296
297 if ($code->default_charset_group() eq 'G0'
298 and $G0 ne $charset_value)
299 {
300 if ($G0 eq ASCII_DEFAULT and $charset_value eq BASIC_LATIN)
301 {
302 # don't bother escaping, they're functionally the same
303 }
304 else
305 {
306 $marc8 .= $code->get_escape();
307 $G0 = $charset_value;
308 }
309 }
310
311 # look to see if we need to escape to a new G1 charset
312 elsif ($code->default_charset_group() eq 'G1'
313 and $G1 ne $charset_value)
314 {
315 $marc8 .= $code->get_escape();
316 $G1 = $charset_value;
317 }
318
319 $marc8 .= $code->marc_value();
320 }
321
322 # escape back to default G0 if necessary
323 if ($G0 ne $DEFAULT_G0)
324 {
325 if ($DEFAULT_G0 eq ASCII_DEFAULT) { $marc8 .= ESCAPE . ASCII_DEFAULT; }
326 elsif ($DEFAULT_G0 eq CJK) { $marc8 .= ESCAPE . MULTI_G0_A . CJK; }
327 else { $marc8 .= ESCAPE . SINGLE_G0_A . $DEFAULT_G0; }
328 }
329
330 # escape back to default G1 if necessary
331 if ($G1 ne $DEFAULT_G1)
332 {
333 if ($DEFAULT_G1 eq CJK) { $marc8 .= ESCAPE . MULTI_G1_A . $DEFAULT_G1; }
334 else { $marc8 .= ESCAPE . SINGLE_G1_A . $DEFAULT_G1; }
335 }
336
337 return $marc8;
338}
339
- -
342=head1 DEFAULT CHARACTER SETS
343
- -
377sub _process_escape
378{
379 ## this stuff is kind of scary ... for an explanation of what is
380 ## going on here check out the MARC-8 specs at LC.
381 ## http://lcweb.loc.gov/marc/specifications/speccharmarc8.html
382 my ($str_ref, $left, $right) = @_;
383
384 # first char needs to be an escape or else this isn't an escape sequence
385 return $left unless substr($$str_ref, $left, 1) eq ESCAPE;
386
387 ## if we don't have at least one character after the escape
388 ## then this can't be a character escape sequence
389 return $left if ($left+1 >= $right);
390
391 ## pull off the first escape
392 my $esc_char_1 = substr($$str_ref, $left+1, 1);
393
394 ## the first method of escaping to small character sets
395 if ( $esc_char_1 eq GREEK_SYMBOLS
396 or $esc_char_1 eq SUBSCRIPTS
397 or $esc_char_1 eq SUPERSCRIPTS
398 or $esc_char_1 eq ASCII_DEFAULT)
399 {
400 $G0 = $esc_char_1;
401 return $left+2;
402 }
403
404 ## the second more complicated method of escaping to bigger charsets
405 return $left if $left+2 >= $right;
406
407 my $esc_char_2 = substr($$str_ref, $left+2, 1);
408 my $esc_chars = $esc_char_1 . $esc_char_2;
409
410 if ($esc_char_1 eq SINGLE_G0_A
411 or $esc_char_1 eq SINGLE_G0_B)
412 {
413 $G0 = $esc_char_2;
414 return $left+3;
415 }
416
417 elsif ($esc_char_1 eq SINGLE_G1_A
418 or $esc_char_1 eq SINGLE_G1_B)
419 {
420 $G1 = $esc_char_2;
421 return $left+3;
422 }
423
424 elsif ( $esc_char_1 eq MULTI_G0_A ) {
425 $G0 = $esc_char_2;
426 return $left+3;
427 }
428
429 elsif ($esc_chars eq MULTI_G0_B
430 and ($left+3 < $right))
431 {
432 $G0 = substr($$str_ref, $left+3, 1);
433 return $left+4;
434 }
435
436 elsif (($esc_chars eq MULTI_G1_A or $esc_chars eq MULTI_G1_B)
437 and ($left + 3 < $right))
438 {
439 $G1 = substr($$str_ref, $left+3, 1);
440 return $left+4;
441 }
442
443 # we should never get here
444 warn("seem to have fallen through in _process_escape()");
445 return $left;
446}
447
448sub reset_charsets
449{
450 $G0 = $DEFAULT_G0;
451 $G1 = $DEFAULT_G1;
452}
453
45418µs1;