[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
dev.16 patch (was: Re: lynx-dev Re: 283dev15 for Win32)
From: |
Hataguchi Takeshi |
Subject: |
dev.16 patch (was: Re: lynx-dev Re: 283dev15 for Win32) |
Date: |
Mon, 13 Dec 1999 01:24:38 +0900 (JST) |
I checked the behavior with half width katakana
and wrote a patch for dev.16.
On Mon, 6 Dec 1999, Klaus Weide wrote:
> running on Windows or something else. Yet it seems a lot of the more
> recently added code for Japanese is Windows-specific. It seems I don't
> even understand the problem, so no surprise that I don't understand the
> solutions.
Really?
I believe almost all of Hiroyuki's code for Japanese is ifdef'd by
CJK_EX and isn't Windows-specific. I haven't looked at all the code
ifdef'd by CJK_EX yet, but it shouldn't be Windows-specific.
> > Anyway,
> > I meant all those characters refered to as "HALFWIDTH" in the document (I
> > think called "SHIFTJIS.TXT"):
> > # Name: Shift-JIS to Unicode
> > # Unicode version: 1.1
> > # Table version: 0.9
> > # Table format: Format A
> > # Date: 8 March 1994
> > # Authors: Glenn Adams <address@hidden>
> > # John H. Jenkins <address@hidden>
> > #
> > # Copyright (c) 1991-1994 Unicode, Inc. All Rights reserved.
> > [...]
> > 0xA1 0xFF61 # HALFWIDTH IDEOGRAPHIC FULL STOP
> [...]
> > 0xDF 0xFF9F # HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
>
> Thank you for the explanation.
>
> The existence of those 1-byte codes is something I totally neglected
> in my recent changes (for WHEREIS search highlighting glitches, I think
> you know what I mean). That means that the code should be correct for
> EUC-JP, but still not for Shift-JIS. (Since WHEREIS operates on the
> end result of Lynx's formatting and conversions, I suppose it should be
> correct for Display Character Set == "Japanese (EUC-JP)" and incorrect
> for D.C.S. == "Japanese (Shift_JIS)", independent of the original charset
> of the document as transmitted, as long as Lynx's conversion was otherwise
> correct.)
There is a code for half width katakana, but we don't always have
fonts for it. So I think it's better for Lynx to convert half width
katakana into full width for display. If CJK_EX is defined,
Lynx actually does so in almost all cases.
# Lynx didn't convert in source mode, but my patch fixes that.
It seems (WHEREIS search) highlighting works well if CJK_EX is defined,
but it doesn't if CJK_EX is not defined, because half width katakana can
then appear on the screen. I think Lynx should always convert half width
katakana into full width. Are there any side effects?
There may be other incorrect behavior in Lynx when a document includes
half width katakana. For example, I found that Lynx fails to parse
tags in the case below.
X<p> (Assume X is half width katakana)
# Precisely speaking, half width katakana is one byte in Shift_JIS and
# two bytes in EUC-JP. Lynx fails only when it's written in Shift_JIS.
I'll attach an example file; you can try it by setting
Display Character Set to Japanese (Shift_JIS or EUC-JP)
without a Japanese font. With my patch applied, Lynx parses it
as expected (I believe).
About this patch:
- Correct the behavior with an odd number of one-byte katakana
followed by ASCII characters.
- Some small changes to guessing the Japanese code set.
- Make it compilable with SH_EX on non-Windows systems.
- If CJK_EX is defined, convert half width katakana into full width
in source mode.
diff -bru orig/lynx2-8-3/WWW/Library/Implementation/HTCJK.h
lynx2-8-3/WWW/Library/Implementation/HTCJK.h
--- orig/lynx2-8-3/WWW/Library/Implementation/HTCJK.h Sat Jul 31 00:39:54 1999
+++ lynx2-8-3/WWW/Library/Implementation/HTCJK.h Mon Dec 13 00:36:30 1999
@@ -37,11 +37,20 @@
#define IS_SJIS_HI1(hi) ((0x81<=hi)&&(hi<=0x9F)) /* 1st lev. */
#define IS_SJIS_HI2(hi) ((0xE0<=hi)&&(hi<=0xEF)) /* 2nd lev. */
#define IS_SJIS(hi,lo,in_sjis)
(!IS_SJIS_LO(lo)?0:IS_SJIS_HI1(hi)?(in_sjis=1):in_sjis&&IS_SJIS_HI2(hi))
+#define IS_SJIS_2BYTE(hi,lo)
(IS_SJIS_LO(lo)&&(IS_SJIS_HI1(hi)||IS_SJIS_HI2(hi)))
+#define IS_SJIS_HWKANA(lo) ((0xA1<=lo)||(lo<=0xDF))
+#if 0 /* This doesn't seemed to be valid code.
+ * ref: http://www.isi.edu/in-notes/iana/assignments/character-sets
+ */
#define IS_EUC_LOS(lo) ((0x21<=lo)&&(lo<=0x7E)) /* standard */
+#endif
#define IS_EUC_LOX(lo) ((0xA1<=lo)&&(lo<=0xFE)) /* extended */
#define IS_EUC_HI(hi) ((0xA1<=hi)&&(hi<=0xFE))
-#define IS_EUC(hi,lo) (IS_EUC_HI(hi) && (IS_EUC_LOS(lo) || IS_EUC_LOX(lo)))
+#define IS_EUC_HWKANA(hi,lo) ((hi==0x8E)&&((0xA1<=lo)||(lo<=0xDF)))
+#define IS_EUC(hi,lo) ((IS_EUC_HI(hi) && IS_EUC_LOX(lo))||IS_EUC_HWKANA(hi,lo))
+
+#define IS_JAPANESE_2BYTE(hi,lo) (IS_SJIS_2BYTE(hi,lo) || IS_EUC(hi,lo))
#define IS_BIG5_LOS(lo) ((0x40<=lo)&&(lo<=0x7E)) /* standard */
#define IS_BIG5_LOX(lo) ((0xA1<=lo)&&(lo<=0xFE)) /* extended */
diff -bru orig/lynx2-8-3/WWW/Library/Implementation/SGML.c
lynx2-8-3/WWW/Library/Implementation/SGML.c
--- orig/lynx2-8-3/WWW/Library/Implementation/SGML.c Thu Nov 4 11:41:38 1999
+++ lynx2-8-3/WWW/Library/Implementation/SGML.c Mon Dec 13 00:37:00 1999
@@ -1477,6 +1477,7 @@
static unsigned char sjis_1st = '\0';
unsigned char sjis_hi, sjis_lo;
#endif
+ static unsigned char kanji_buf;
c = c_in;
clong = (unsigned char)c; /* a.k.a. unsign_c */
@@ -1690,7 +1691,7 @@
HTCJK == NOCJK)
goto after_switch;
-#ifdef CJK_EX /* 1998/11/24 (Tue) 17:02:31 */
+#if 0 /* This should be a business of GridText */
if (HTCJK == JAPANESE && last_kcode == SJIS) {
if (sjis_1st == '\0' && (IS_SJIS_HI1(c) || IS_SJIS_HI2(c))) {
sjis_1st = c;
@@ -1727,6 +1728,22 @@
!(PASSHICTRL || HTCJK != NOCJK))
goto after_switch;
+ if ((HTCJK==JAPANESE) && (context->state==S_in_kanji) &&
+ !IS_JAPANESE_2BYTE(kanji_buf,(unsigned char)c)) {
+#if CJK_EX
+ if (IS_SJIS_HWKANA(kanji_buf) && (last_kcode == SJIS)) {
+ JISx0201TO0208_SJIS(kanji_buf, &sjis_hi, &sjis_lo);
+ PUTC(sjis_hi);
+ PUTC(sjis_lo);
+ }
+ else
+ PUTC('=');
+#else
+ PUTC('=');
+#endif
+ context->state = S_text;
+ }
+
/*
** Handle character based on context->state.
*/
@@ -1744,6 +1761,7 @@
** (see below). - FM
*/
context->state = S_text;
+ PUTC(kanji_buf);
PUTC(c);
break;
@@ -1772,7 +1790,7 @@
** to having raw mode off with CJK. - FM
*/
context->state = S_in_kanji;
- PUTC(c);
+ kanji_buf = c;
break;
} else if (HTCJK != NOCJK && TOASCII(c) == '\033') { /* S/390 -- gil
-- 0881 */
/*
diff -bru orig/lynx2-8-3/src/GridText.c lynx2-8-3/src/GridText.c
--- orig/lynx2-8-3/src/GridText.c Wed Dec 1 12:33:02 1999
+++ lynx2-8-3/src/GridText.c Mon Dec 13 00:36:12 1999
@@ -3623,7 +3623,11 @@
/*
* JIS X0201 Kana in SJIS support. - by ASATAKU
*/
+#ifdef CJK_EX
+ if (((text->kcode == SJIS) || (last_kcode == SJIS)) &&
+#else
if ((text->kcode == SJIS) &&
+#endif
((unsigned char)ch >= 0xA1) &&
((unsigned char)ch <= 0xDF))
{
@@ -4053,6 +4057,14 @@
lo = (unsigned char)ch;
if (HTCJK == JAPANESE) {
+ /* should be a better guesser */
+ if (IS_EUC(hi, lo) && ! IS_SJIS_2BYTE(hi, lo)) {
+ text->kcode = EUC;
+ }
+ else if (!IS_EUC(hi, lo) && IS_SJIS_2BYTE(hi, lo)) {
+ text->kcode = SJIS;
+ }
+#if 0
if (text->kcode == NOKANJI)
{
if (IS_SJIS(hi, lo, text->in_sjis) && IS_EUC(hi, lo)) {
@@ -4063,6 +4075,7 @@
text->kcode = EUC;
}
}
+#endif
switch (kanji_code) {
case EUC:
@@ -4070,7 +4083,7 @@
SJIS_TO_EUC1(hi, lo, tmp);
line->data[line->size++] = tmp[0];
line->data[line->size++] = tmp[1];
- } else if (text->kcode == EUC) {
+ } else if (IS_EUC(hi, lo)) {
JISx0201TO0208_EUC(hi, lo, &hi, &lo);
line->data[line->size++] = hi;
line->data[line->size++] = lo;
diff -bru orig/lynx2-8-3/src/HTML.c lynx2-8-3/src/HTML.c
--- orig/lynx2-8-3/src/HTML.c Wed Dec 1 12:33:02 1999
+++ lynx2-8-3/src/HTML.c Sun Dec 12 23:35:22 1999
@@ -8710,19 +8710,19 @@
#ifdef SH_EX /* 1998/04/02 (Thu) 16:02:00 */
/* for proxy server 1998/12/19 (Sat) 11:53:30 */
- if (stricmp(newtitle + 1, "internal-gopher-menu") == 0) {
+ if (AS_casecomp(newtitle + 1, "internal-gopher-menu") == 0) {
StrAllocCopy(newtitle, "+");
- } else if (stricmp(newtitle + 1, "internal-gopher-unknown") == 0) {
+ } else if (AS_casecomp(newtitle + 1, "internal-gopher-unknown") == 0) {
StrAllocCopy(newtitle, " ");
} else {
/* normal title */
ptr = strrchr(newtitle, '.');
if (ptr) {
- if (stricmp(ptr, ".gif") == 0)
+ if (AS_casecomp(ptr, ".gif") == 0)
*ptr = '\0';
- else if (stricmp(ptr, ".jpg") == 0)
+ else if (AS_casecomp(ptr, ".jpg") == 0)
*ptr = '\0';
- else if (stricmp(ptr, ".jpeg") == 0)
+ else if (AS_casecomp(ptr, ".jpeg") == 0)
*ptr = '\0';
}
StrAllocCat(newtitle, "]");
diff -bru orig/lynx2-8-3/src/LYMail.c lynx2-8-3/src/LYMail.c
--- orig/lynx2-8-3/src/LYMail.c Wed Sep 29 20:40:38 1999
+++ lynx2-8-3/src/LYMail.c Sun Dec 12 23:35:22 1999
@@ -2048,7 +2048,7 @@
while ((n = fread(buf, 1, sizeof(buf), fd)) != 0) {
fwrite(buf, 1, n, fp);
}
-#if defined(DOSPATH) || defined(SH_EX)
+#if defined(DOSPATH) || (defined(SH_EX) && defined(WIN_EX))
#ifdef SH_EX /* 1998/05/04 (Mon) 22:40:35 */
if (mail_is_blat) {
StrAllocCopy(command,
--
Takeshi Hataguchi
E-mail: address@hidden
h_w_kana.html
Description: Binary data
- dev.16 patch (was: Re: lynx-dev Re: 283dev15 for Win32),
Hataguchi Takeshi <=