CoCalc Public Fileswww / cgi-bin / openwebmail / misc / test / jcode.plOpen with one click!
Author: William A. Stein
1
#!/usr/bin/perl
2
package jcode;
3
######################################################################
4
#
5
# jcode.pl: `Perl5' library for Japanese character code conversion
6
#
7
# Modified by Nobuchika Oishi <[email protected]>
8
#
9
# Original jcode.pl version 2.13 by Kazumasa Utashiro.
10
#
11
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
12
# ANY EXPRESS OR IMPLIED WARRANTIES ARE DISCLAIMED.
13
#
14
# ver 2.13.1 : updated last on 2001/09/24
15
#
16
######################################################################
17
#
18
# jcode.pl - original copyright -
19
#
20
# Copyright (c) 1995-2000 Kazumasa Utashiro <[email protected]>
21
# Internet Initiative Japan Inc.
22
# 3-13 Kanda Nishiki-cho, Chiyoda-ku, Tokyo 101-0054, Japan
23
#
24
# Copyright (c) 1992,1993,1994 Kazumasa Utashiro
25
# Software Research Associates, Inc.
26
#
27
# Use and redistribution for ANY PURPOSE are granted as long as all
28
# copyright notices are retained. Redistribution with modification
29
# is allowed provided that you make your modified version obviously
30
# distinguishable from the original one. THIS SOFTWARE IS PROVIDED
31
# BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES ARE
32
# DISCLAIMED.
33
#
34
# Original version was developed under the name of [email protected]
35
# February 1992 and it was called kconv.pl at the beginning. This
36
# address was a pen name for group of individuals and it is no longer
37
# valid.
38
#
39
# The latest version of `original jcode.pl' is available here:
40
#
41
# ftp://ftp.iij.ad.jp/pub/IIJ/dist/utashiro/perl/
42
#
43
######################################################################
44
#
45
# PERL5 INTERFACE:
46
#
47
# jcode::getcode(\$line)
48
# Return 'jis', 'sjis', 'euc', 'utf8', 'ucs2' or undef
49
# according to Japanese character code in $line.
50
# Return 'binary' if the data has non-character code.
51
#
52
# When evaluated in array context, it returns a list
53
# contains two items. First value is the number of
54
# characters which matched to the expected code, and
55
# second value is the code name. It is useful if and
56
# only if the number is not 0 and the code is undef;
57
# that case means it couldn't tell 'euc' or 'sjis' or
58
# 'utf8' because the evaluation score was exactly same.
59
# This interface is too tricky, though.
60
#
61
# Code detection between euc and sjis is very difficult
62
# or sometimes impossible or even lead to wrong result
63
# when it includes JIS X0201 KANA characters. So JIS
64
# X0201 KANA is ignored for automatic code detection.
65
#
66
# jcode::convert(\$line, $ocode [, $icode [, $option]])
67
# Convert the contents of $line to the specified
68
# Japanese code given in the second argument $ocode.
69
# $ocode can be any of "jis", "sjis" or "euc", or use
70
# "noconv" when you don't want the code conversion.
71
# Input code is recognized automatically from the line
72
# itself when $icode is not supplied (JIS X0201 KANA is
73
# ignored in code detection. See the above descripton
74
# of &getcode). $icode also can be specified, but
75
# xxx2yyy routine is more efficient when both codes are
76
# known.
77
#
78
# It returns the code of input string in scalar context,
79
# and a list of pointer of convert subroutine and the
80
# input code in array context.
81
#
82
# Japanese character code JIS X0201, X0208, X0212 and
83
# ASCII code are supported. X0212 characters can not be
84
# represented in SJIS and they will be replased by
85
# "geta" character when converted to SJIS.
86
#
87
# See next paragraph for $option parameter.
88
#
89
# jcode::xxx2yyy(\$line [, $option])
90
# Convert the Japanese code from xxx to yyy. String xxx
91
# and yyy are any convination from "jis", "euc" or
92
# "sjis". They return *approximate* number of converted
93
# bytes. So return value 0 means the line was not
94
# converted at all.
95
#
96
# Optional parameter $option is used to specify optional
97
# conversion method. String "z" is for JIS X0201 KANA
98
# to X0208 KANA, and "h" is for reverse.
99
#
100
# $jcode::convf{'xxx', 'yyy'}
101
# The value of this associative array is reference to the
102
# subroutine jcode::xxx2yyy().
103
#
104
# jcode::to($ocode, $line [, $icode [, $option]])
105
# jcode::jis($line [, $icode [, $option]])
106
# jcode::euc($line [, $icode [, $option]])
107
# jcode::sjis($line [, $icode [, $option]])
108
# These functions are prepared for easy use of
109
# call/return-by-value interface. You can use these
110
# funcitons in s///e operation or any other place for
111
# convenience.
112
#
113
# jcode::jis_inout($in, $out)
114
# Set or inquire JIS start and end sequences. Default
115
# is "ESC-$-B" and "ESC-(-B". If you supplied only one
116
# character, "ESC-$" or "ESC-(" is prepended for each
117
# character respectively. Acutually "ESC-(-B" is not a
118
# sequence to end JIS code but a sequence to start ASCII
119
# code set. So `in' and `out' are somewhat misleading.
120
#
121
# jcode::get_inout($string)
122
# Get JIS start and end sequences from $string.
123
#
124
# jcode::cache()
125
# jcode::nocache()
126
# jcode::flush()
127
# Usually, converted character is cached in memory to
128
# avoid same calculations have to be done many times.
129
# To disable this caching, call jcode::nocache(). It
130
# can be revived by jcode::cache() and cache is flushed
131
# by calling jcode::flush(). cache() and &nocache()
132
# functions return previous caching state.
133
#
134
# ---------------------------------------------------------------
135
#
136
# jcode::h2z_xxx(\$line)
137
# JIS X0201 KANA (so-called Hankaku-KANA) to X0208 KANA
138
# (Zenkaku-KANA) code conversion routine. String xxx is
139
# any of "jis", "sjis" and "euc". From the difficulty
140
# of recognizing code set from 1-byte KATAKANA string,
141
# automatic code recognition is not supported.
142
#
143
# jcode::z2h_xxx(\$line)
144
# X0208 to X0201 KANA code conversion routine. String
145
# xxx is any of "jis", "sjis" and "euc".
146
#
147
# jcode::z2hf{'xxx'}
148
# jcode::h2zf{'xxx'}
149
# These are reference to the corresponding function just
150
# as $jcode::convf.
151
#
152
# ---------------------------------------------------------------
153
#
154
# jcode::tr(\$line, $from, $to [, $option])
155
# jcode::tr emulates tr operator for 2 byte code. Only 'd'
156
# is interpreted as an option.
157
#
158
# Range operator like `A-Z' for 2 byte code is partially
159
# supported. Code must be JIS or EUC or SJIS, and first
160
# byte have to be same on first and last character.
161
#
162
# CAUTION: Handling range operator is a kind of trick
163
# and it is not perfect. So if you need to transfer `-'
164
# character, please be sure to put it at the beginning
165
# or the end of $from and $to strings.
166
#
167
# jcode::trans($line, $from, $to [, $option)
168
# Same as jcode::tr but accept string and return string
169
# after translation.
170
#
171
# ---------------------------------------------------------------
172
#
173
# jcode::init()
174
# Initialize the variables used in this package. You
175
# don't have to call this when using jocde.pl by `do' or
176
# `require' interface. Call it first if you embedded
177
# the jcode.pl at the end of your script.
178
#
179
######################################################################
180
181
#
182
# Call initialize function if it is not called yet. This may sound
183
# strange but it makes easy to embed the jcode.pl at the end of
184
# script. Call jcode::init() at the beginning of the script in that
185
# case.
186
#
187
188
init() unless defined $version;
189
190
#
191
# Initialize variables.
192
#
193
sub init {
194
$version = '2.13.1';
195
196
$re_bin = '[\000-\006\177\377]';
197
$re_utf8 = '[\300-\337][\200-\277]|[\340-\357][\200-\277][\200-\277]';
198
199
$re_jis0208_1978 = '\e\$\@';
200
$re_jis0208_1983 = '\e\$B';
201
$re_jis0208_1990 = '\e&\@\e\$B';
202
203
$re_jis0208 = "$re_jis0208_1978|$re_jis0208_1983|$re_jis0208_1990";
204
$re_jis0212 = '\e\$\(D';
205
$re_jp = "$re_jis0208|$re_jis0212";
206
$re_asc = '\e\([BJ]';
207
$re_kana = '\e\(I';
208
209
$esc_0208 = "\e\$B";
210
$esc_0212 = "\e\$(D";
211
$esc_asc = "\e(B";
212
$esc_kana = "\e(I";
213
214
$re_sjis_c = '[\201-\237\340-\374][\100-\176\200-\374]';
215
$re_sjis_kana = '[\241-\337]';
216
$re_euc_c = '[\241-\376][\241-\376]';
217
$re_euc_kana = '\216[\241-\337]';
218
$re_euc_0212 = '\217[\241-\376][\241-\376]';
219
$undef_sjis = "\x81\xac";
220
221
$cache = 1;
222
223
# X0201 -> X0208 KANA conversion table. Looks weird? Not that
224
# much. This is simply JIS text without escape sequences.
225
my($h2z_high, $h2z);
226
($h2z_high = $h2z = <<'__TABLE_END__') =~ tr/\041-\176/\241-\376/;
227
! !# $ !" % !& " !V # !W
228
^ !+ _ !, 0 !<
229
' %! ( %# ) %% * %' + %)
230
, %c - %e . %g / %C
231
1 %" 2 %$ 3 %& 4 %( 5 %*
232
6 %+ 7 %- 8 %/ 9 %1 : %3
233
6^ %, 7^ %. 8^ %0 9^ %2 :^ %4
234
; %5 < %7 = %9 > %; ? %=
235
;^ %6 <^ %8 =^ %: >^ %< ?^ %>
236
@ %? A %A B %D C %F D %H
237
@^ %@ A^ %B B^ %E C^ %G D^ %I
238
E %J F %K G %L H %M I %N
239
J %O K %R L %U M %X N %[
240
J^ %P K^ %S L^ %V M^ %Y N^ %\
241
J_ %Q K_ %T L_ %W M_ %Z N_ %]
242
O %^ P %_ Q %` R %a S %b
243
T %d U %f V %h
244
W %i X %j Y %k Z %l [ %m
245
\ %o ] %s & %r 3^ %t
246
__TABLE_END__
247
%h2z = split(/\s+/, $h2z . $h2z_high);
248
%z2h = reverse %h2z;
249
250
$convf{'jis' , 'jis' } = \&jis2jis;
251
$convf{'jis' , 'sjis'} = \&jis2sjis;
252
$convf{'jis' , 'euc' } = \&jis2euc;
253
$convf{'euc' , 'jis' } = \&euc2jis;
254
$convf{'euc' , 'sjis'} = \&euc2sjis;
255
$convf{'euc' , 'euc' } = \&euc2euc;
256
$convf{'sjis', 'jis' } = \&sjis2jis;
257
$convf{'sjis', 'sjis'} = \&sjis2sjis;
258
$convf{'sjis', 'euc' } = \&sjis2euc;
259
$h2zf{'jis'} = \&h2z_jis;
260
$z2hf{'jis'} = \&z2h_jis;
261
$h2zf{'euc'} = \&h2z_euc;
262
$z2hf{'euc'} = \&z2h_euc;
263
$h2zf{'sjis'} = \&h2z_sjis;
264
$z2hf{'sjis'} = \&z2h_sjis;
265
}
266
267
#
268
# Set escape sequences which should be put before and after Japanese
269
# (JIS X0208) string.
270
#
271
sub jis_inout {
272
$esc_0208 = shift || $esc_0208;
273
$esc_0208 = "\e\$$esc_0208" if length($esc_0208) == 1;
274
$esc_asc = shift || $esc_asc;
275
$esc_asc = "\e\($esc_asc" if length($esc_asc) == 1;
276
($esc_0208, $esc_asc);
277
}
278
279
#
280
# Get JIS in and out sequences from the string.
281
#
282
sub get_inout {
283
my($esc_0208, $esc_asc);
284
$_[$[] =~ /($re_jis0208)/o && ($esc_0208 = $1);
285
$_[$[] =~ /($re_asc)/o && ($esc_asc = $1);
286
($esc_0208, $esc_asc);
287
}
288
289
#
290
# Recognize character code.
291
#
292
sub getcode {
293
my $s = shift;
294
my($matched, $code);
295
if($$s =~ /$re_bin/o) { # 'binary' or 'ucs2'
296
my $ucs2 = 0;
297
while ($$s =~ /((?:\000[\000-\177])+)/go){
298
$ucs2 += length($1);
299
}
300
if($ucs2){
301
($code, $matched) = ('ucs2', $ucs2);
302
} else {
303
($code, $matched) = ('binary', 0);
304
}
305
} elsif($$s !~ /[\e\200-\377]/o) { # not Japanese
306
$matched = 0;
307
$code = undef;
308
} elsif($$s =~ /$re_jp|$re_asc|$re_kana/o) { # 'jis'
309
$matched = 1;
310
$code = 'jis';
311
} else { # 'euc' or 'sjis' or 'utf8'
312
my($sjis, $euc, $utf8) = (0, 0, 0);
313
while ($$s =~ /((?:$re_sjis_c)+)/go) {
314
$sjis += length($1);
315
}
316
while ($$s =~ /((?:$re_euc_c|$re_euc_kana|$re_euc_0212)+)/go) {
317
$euc += length($1);
318
}
319
while ($$s =~ /((?:$re_utf8)+)/go){
320
$utf8 += length($1);
321
}
322
$matched = _max($sjis, $euc, $utf8);
323
$code = ($sjis > $euc and $sjis > $utf8) ? 'sjis' :
324
($euc > $sjis and $euc > $utf8) ? 'euc' :
325
($utf8 > $euc and $utf8 > $sjis) ? 'utf8' : undef;
326
}
327
wantarray ? ($matched, $code) : $code;
328
}
329
330
sub _max {
331
my $c = shift;
332
for my $n (@_){
333
$c = $n if $n > $c;
334
}
335
$c;
336
}
337
338
#
339
# Convert any code to specified code.
340
#
341
sub convert {
342
my($s, $ocode, $icode, $opt) = @_;
343
$icode ||= getcode($s);
344
$ocode = 'jis' unless $ocode;
345
$ocode = $icode if $ocode eq 'noconv';
346
my $f = $convf{$icode, $ocode};
347
&$f($s, $opt) if ref $f;
348
wantarray ? ($f, $icode) : $icode;
349
}
350
351
#
352
# Easy return-by-value interfaces.
353
#
354
sub jis { to('jis', @_); }
355
sub euc { to('euc', @_); }
356
sub sjis { to('sjis', @_); }
357
sub to {
358
my($ocode, $s, $icode, $opt) = @_;
359
convert(\$s, $ocode, $icode, $opt);
360
$s;
361
}
362
sub what {
363
my $s = shift;
364
getcode(\$s);
365
}
366
sub trans {
367
my $s = shift;
368
&tr(\$s, @_);
369
$s;
370
}
371
372
#
373
# SJIS to JIS
374
#
375
sub sjis2jis {
376
my($s, $opt) = @_;
377
local($n) = 0;
378
sjis2sjis($s, $opt) if $opt;
379
$$s =~ s/(($re_sjis_c|$re_sjis_kana)+)/_sjis2jis($1) . $esc_asc/geo;
380
$n;
381
}
382
sub _sjis2jis {
383
my $s = shift;
384
$s =~ s/(($re_sjis_c)+|($re_sjis_kana)+)/__sjis2jis($1)/geo;
385
$s;
386
}
387
sub __sjis2jis {
388
my $s = shift;
389
if($s =~ /^$re_sjis_kana/o){
390
$n += $s =~ tr/\241-\337/\041-\137/;
391
$esc_kana . $s;
392
} else {
393
$n += $s =~ s/($re_sjis_c)/$s2e{$1} || s2e($1)/geo;
394
$s =~ tr/\241-\376/\041-\176/;
395
$esc_0208 . $s;
396
}
397
}
398
399
#
400
# EUC to JIS
401
#
402
sub euc2jis {
403
my($s, $opt) = @_;
404
local($n) = 0;
405
euc2euc($s, $opt) if $opt;
406
$$s =~ s/(($re_euc_c|$re_euc_kana|$re_euc_0212)+)/_euc2jis($1) . $esc_asc/geo;
407
$n;
408
}
409
sub _euc2jis {
410
my $s = shift;
411
$s =~ s/(($re_euc_c)+|($re_euc_kana)+|($re_euc_0212)+)/__euc2jis($1)/geo;
412
$s;
413
}
414
sub __euc2jis {
415
my $s = shift;
416
my($esc);
417
if($s =~ tr/\216//d){
418
$esc = $esc_kana;
419
} elsif($s =~ tr/\217//d){
420
$esc = $esc_0212;
421
} else {
422
$esc = $esc_0208;
423
}
424
$n += $s =~ tr/\241-\376/\041-\176/;
425
$esc . $s;
426
}
427
428
#
429
# JIS to EUC
430
#
431
sub jis2euc {
432
my($s, $opt) = @_;
433
local($n) = 0;
434
$$s =~ s/($re_jp|$re_asc|$re_kana)([^\e]*)/_jis2euc($1, $2)/geo;
435
euc2euc($s, $opt) if $opt;
436
$n;
437
}
438
sub _jis2euc {
439
my($esc, $s) = @_;
440
if($esc !~ /^$re_asc/o){
441
$n += $s =~ tr/\041-\176/\241-\376/;
442
if($esc =~ /^$re_kana/o){
443
$s =~ s/([\241-\337])/\216$1/go;
444
} elsif($esc =~ /^$re_jis0212/o){
445
$s =~ s/([\241-\376][\241-\376])/\217$1/go;
446
}
447
}
448
$s;
449
}
450
451
#
452
# JIS to SJIS
453
#
454
sub jis2sjis {
455
my($s, $opt) = @_;
456
local($n) = 0;
457
jis2jis($s, $opt) if $opt;
458
$$s =~ s/($re_jp|$re_asc|$re_kana)([^\e]*)/_jis2sjis($1, $2)/geo;
459
$n;
460
}
461
sub _jis2sjis {
462
my($esc, $s) = @_;
463
if($esc =~ /^$re_jis0212/o){
464
$s =~ s/../$undef_sjis/go;
465
$n = length;
466
} elsif($esc !~ /^$re_asc/o){
467
$n += $s =~ tr/\041-\176/\241-\376/;
468
if($esc =~ /^$re_jp/o){
469
$s =~ s/($re_euc_c)/$e2s{$1} || e2s($1)/geo;
470
}
471
}
472
$s;
473
}
474
475
#
476
# SJIS to EUC
477
#
478
sub sjis2euc {
479
my($s, $opt) = @_;
480
my $n = $$s =~ s/($re_sjis_c|$re_sjis_kana)/$s2e{$1} || s2e($1)/geo;
481
euc2euc($s, $opt) if $opt;
482
$n;
483
}
484
sub s2e {
485
my($c1, $c2, $code);
486
($c1, $c2) = unpack('CC', $code = shift);
487
if(0xa1 <= $c1 && $c1 <= 0xdf){
488
$c2 = $c1;
489
$c1 = 0x8e;
490
} elsif(0x9f <= $c2){
491
$c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe0 : 0x60);
492
$c2 += 2;
493
} else {
494
$c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe1 : 0x61);
495
$c2 += 0x60 + ($c2 < 0x7f);
496
}
497
if($cache){
498
$s2e{$code} = pack('CC', $c1, $c2);
499
} else {
500
pack('CC', $c1, $c2);
501
}
502
}
503
504
#
505
# EUC to SJIS
506
#
507
sub euc2sjis {
508
my($s, $opt) = @_;
509
euc2euc($s, $opt) if $opt;
510
my $n = $$s =~ s/($re_euc_c|$re_euc_kana|$re_euc_0212)/$e2s{$1} || e2s($1)/geo;
511
}
512
sub e2s {
513
my($c1, $c2, $code);
514
($c1, $c2) = unpack('CC', $code = shift);
515
if($c1 == 0x8e){ # SS2
516
return substr($code, 1, 1);
517
} elsif($c1 == 0x8f){ # SS3
518
return $undef_sjis;
519
} elsif($c1 % 2){
520
$c1 = ($c1>>1) + ($c1 < 0xdf ? 0x31 : 0x71);
521
$c2 -= 0x60 + ($c2 < 0xe0);
522
} else {
523
$c1 = ($c1>>1) + ($c1 < 0xdf ? 0x30 : 0x70);
524
$c2 -= 2;
525
}
526
if($cache){
527
$e2s{$code} = pack('CC', $c1, $c2);
528
} else {
529
pack('CC', $c1, $c2);
530
}
531
}
532
533
#
534
# JIS to JIS, SJIS to SJIS, EUC to EUC
535
#
536
sub jis2jis {
537
my($s, $opt) = @_;
538
$$s =~ s/$re_jis0208/$esc_0208/go;
539
$$s =~ s/$re_asc/$esc_asc/go;
540
h2z_jis($s) if $opt =~ /z/o;
541
z2h_jis($s) if $opt =~ /h/o;
542
}
543
sub sjis2sjis {
544
my($s, $opt) = @_;
545
h2z_sjis($s) if $opt =~ /z/o;
546
z2h_sjis($s) if $opt =~ /h/o;
547
}
548
sub euc2euc {
549
my($s, $opt) = @_;
550
h2z_euc($s) if $opt =~ /z/o;
551
z2h_euc($s) if $opt =~ /h/o;
552
}
553
554
#
555
# Cache control functions
556
#
557
sub cache {
558
($cache, $cache = 1)[$[];
559
}
560
sub nocache {
561
($cache, $cache = 0)[$[];
562
}
563
sub flush {
564
undef %e2s;
565
undef %s2e;
566
}
567
568
#
569
# X0201 -> X0208 KANA conversion routine
570
#
571
sub h2z_jis {
572
my $s = shift;
573
local($n) = 0;
574
if($$s =~ s/$re_kana([^\e]*)/$esc_0208 . _h2z_jis($1)/geo){
575
1 while $$s =~ s/(($re_jis0208)[^\e]*)($re_jis0208)/$1/o;
576
}
577
$n;
578
}
579
sub _h2z_jis {
580
my $s = shift;
581
$n += $s =~ s/(([\041-\137])([\136\137])?)/$h2z{$1} || $h2z{$2} . $h2z{$3}/geo;
582
$s;
583
}
584
585
sub h2z_euc {
586
my $s = shift;
587
$$s =~ s/\216([\241-\337])(\216([\336\337]))?/$h2z{"$1$3"} || $h2z{$1} . $h2z{$3}/geo;
588
}
589
590
sub h2z_sjis {
591
my $s = shift;
592
my $n = 0;
593
$$s =~ s/(($re_sjis_c)+)|(([\241-\337])([\336\337])?)/
594
$1 || ($n++, $h2z{$3} ? $e2s{$h2z{$3}} || e2s($h2z{$3})
595
: e2s($h2z{$4}) . ($5 && e2s($h2z{$5})))
596
/geo;
597
$n;
598
}
599
600
#
601
# X0208 -> X0201 KANA conversion routine
602
#
603
sub z2h_jis {
604
my $s = shift;
605
local($n) = 0;
606
$$s =~ s/($re_jis0208)([^\e]+)/_z2h_jis($2)/geo;
607
$n;
608
}
609
sub _z2h_jis {
610
my $s = shift;
611
$s =~ s/((\%[!-~]|![\#\"&VW+,<])+|([^!%][!-~]|![^\#\"&VW+,<])+)/__z2h_jis($1)/geo;
612
$s;
613
}
614
sub __z2h_jis {
615
my $s = shift;
616
return $esc_0208 . $s unless $s =~ /^%/o || $s =~ /^![\#\"&VW+,<]/o;
617
$n += length($s) / 2;
618
$s =~ s/(..)/$z2h{$1}/go;
619
$esc_kana . $s;
620
}
621
622
sub z2h_euc {
623
my $s = shift;
624
my $n = 0;
625
init_z2h_euc() unless defined %z2h_euc;
626
$$s =~ s/($re_euc_c|$re_euc_kana)/$z2h_euc{$1} ? ($n++, $z2h_euc{$1}) : $1/geo;
627
$n;
628
}
629
630
sub z2h_sjis {
631
my $s = shift;
632
my $n = 0;
633
init_z2h_sjis() unless defined %z2h_sjis;
634
$$s =~ s/($re_sjis_c)/$z2h_sjis{$1} ? ($n++, $z2h_sjis{$1}) : $1/geo;
635
$n;
636
}
637
638
#
639
# Initializing JIS X0208 to X0201 KANA table for EUC and SJIS. This
640
# can be done in &init but it's not worth doing. Similarly,
641
# precalculated table is not worth to occupy the file space and
642
# reduce the readability. The author personnaly discourages to use
643
# X0201 Kana character in the any situation.
644
#
645
sub init_z2h_euc {
646
my($k, $s);
647
while (($k, $s) = each %z2h){
648
$s =~ s/([\241-\337])/\216$1/go && ($z2h_euc{$k} = $s);
649
}
650
}
651
sub init_z2h_sjis {
652
my($s, $v);
653
while (($s, $v) = each %z2h){
654
$s =~ /[\200-\377]/o && ($z2h_sjis{e2s($s)} = $v);
655
}
656
}
657
658
#
659
# TR function for 2-byte code
660
#
661
sub tr {
662
# $tr_prev_from, $tr_prev_to, %tr_table are persistent variables
663
my($s, $from, $to, $opt) = @_;
664
my $c = getcode($s);
665
if($c eq 'jis' || $c eq 'sjis'){
666
convert($s, 'euc', $c);
667
} else {
668
$c = undef;
669
}
670
if(!defined($tr_prev_from) || $from ne $tr_prev_from || $to ne $tr_prev_to){
671
($tr_prev_from, $tr_prev_to) = ($from, $to);
672
undef %tr_table;
673
_maketable($from, $to, $opt, $c);
674
}
675
my $n = 0;
676
$$s =~ s/([\200-\377][\000-\377]|[\000-\377])/defined($tr_table{$1}) && ++$n ? $tr_table{$1} : $1/geo;
677
if($c){
678
convert($s, $c, 'euc');
679
}
680
$n;
681
}
682
683
sub _maketable {
684
my($from, $to, $opt, $c) = @_;
685
if($c){
686
convert(\$from, 'euc', $c);
687
convert(\$to, 'euc', $c);
688
}
689
$from =~ s/($re_euc_0212-$re_euc_0212)/&_expnd3($1)/geo;
690
$from =~ s/($re_euc_kana-$re_euc_kana)/&_expnd2($1)/geo;
691
$from =~ s/($re_euc_c-$re_euc_c)/&_expnd2($1)/geo;
692
$from =~ s/([\x00-\xff]-[\x00-\xff])/&_expnd1($1)/geo;
693
$to =~ s/($re_euc_0212-$re_euc_0212)/&_expnd3($1)/geo;
694
$to =~ s/($re_euc_kana-$re_euc_kana)/&_expnd2($1)/geo;
695
$to =~ s/($re_euc_c-$re_euc_c)/&_expnd2($1)/geo;
696
$to =~ s/([\x00-\xff]-[\x00-\xff])/&_expnd1($1)/geo;
697
698
my @from = $from =~ /$re_euc_0212|$re_euc_kana|$re_euc_c|[\x00-\xff]/go;
699
my @to = $to =~ /$re_euc_0212|$re_euc_kana|$re_euc_c|[\x00-\xff]/go;
700
push(@to, ($opt =~ /d/o ? '' : $to[-1]) x (@from - @to)) if @to < @from;
701
@tr_table{@from} = @to;
702
}
703
704
sub _expnd1 {
705
my $s = shift;
706
my($c1, $c2) = unpack('CxC', $s);
707
if($c1 <= $c2){
708
for($s = ''; $c1 <= $c2; $c1++){
709
$s .= pack('C', $c1);
710
}
711
}
712
$s;
713
}
714
715
sub _expnd2 {
716
my $s = shift;
717
my($c1, $c2, $c3, $c4) = unpack('CCxCC', $s);
718
if($c1 == $c3 && $c2 <= $c4){
719
for($s = ''; $c2 <= $c4; $c2++){
720
$s .= pack('CC', $c1, $c2);
721
}
722
}
723
$s;
724
}
725
726
sub _expnd3 {
727
my $s = shift;
728
my($c1, $c2, $c3, $c4, $c5, $c6) = unpack('CCCxCCC', $s);
729
if($c1 == $c4 && $c2 == $c5 && $c3 <= $c6){
730
for($s = ''; $c3 <= $c6; $c3++){
731
$s .= pack('CCC', $c1, $c2, $c3);
732
}
733
}
734
$s;
735
}
736
737
1;
738
739
################ small uty to do jp charset transcoding ###########
740
741
my $ocode=shift(@ARGV);
742
my $icode=shift(@ARGV);
743
my $outputdir=shift(@ARGV);
744
745
if ($#ARGV<0) {
746
print "jcode <ocode> <icode> <outputdir> files...\n".
747
"icode/ocode can be one of jis, sjis, euc, uft8, ucs2\n";
748
exit;
749
}
750
751
if ( ! -d $outputdir ) {
752
print "$outputfir not exist\n";
753
exit;
754
}
755
756
if ($icode ne 'jis' &&
757
$icode ne 'sjis' &&
758
$icode ne 'euc' &&
759
$icode ne 'utf8' &&
760
$icode ne 'ucs2') {
761
print "$icode error, should be one of jis, sjis, euc, uft8, ucs2\n";
762
exit;
763
}
764
765
if ($ocode ne 'jis' &&
766
$ocode ne 'sjis' &&
767
$ocode ne 'euc' &&
768
$ocode ne 'utf8' &&
769
$ocode ne 'ucs2') {
770
print "$ocode error, should be one of jis, sjis, euc, uft8, ucs2\n";
771
exit;
772
}
773
774
foreach my $file (@ARGV) {
775
my $file2=$file;
776
$file2=~s!^.*/!!;
777
$file2="$outputdir/$file2";
778
779
open (F, $file) || die "$file read error";
780
open (G, ">$file2") || die "$outputdir/$file write error";
781
while(<F>) {
782
my $line=$_;
783
convert(\$line, $ocode, $icode);
784
print G $line;
785
}
786
close(G);
787
close(F);
788
print "$file2\n";
789
}
790