@@ -675,10 +675,10 @@ static void crcheck(char *lbuff, FILE *fp)
675675static void index_normalize (UChar * istr , UChar * ini , int * chset )
676676{
677677 int k , hi , lo , mi ;
678- UChar ch ,src [2 ],dest [8 ],strX [4 ],strY [4 ],strZ [4 ];
678+ UChar ch ,src [2 ],dest [8 ],strX [4 ],strY [4 ],strZ [4 ], strW [ 4 ] ;
679679 UChar32 c32 ;
680680 UErrorCode perr ;
681- UCollationResult order ,order1 ;
681+ UCollationResult order ,order1 , order2 , order3 , order4 , order5 ;
682682 UCollationStrength strgth ;
683683 static int i_y_mode = 0 ,o_o_mode = 0 ,u_u_mode = 0 ,v_w_mode = 0 ,s_s_mode = 0 ,t_t_mode = 0 ;
684684
@@ -1018,26 +1018,70 @@ static void index_normalize(UChar *istr, UChar *ini, int *chset)
10181018 ini [0 ] = 0x21A ; return ;
10191019 }
10201020 }
1021- if (ch == 0x0D6 || ch == 0x0F6 || ch == 0x150 || ch == 0x151 ) {
1022- /* check Ö,ö versus Ő,ő for Hungarian */
1021+ if (ch == 0x0D6 || ch == 0x0F6 || ch == 0x150 || ch == 0x151
1022+ || ch == 0x0D8 || ch == 0x0F8 || ch == 0x0D5 || ch == 0x0F5 ) {
1023+ /* check Ö,ö versus Ő,ő for Hungarian
1024+ Ø,ø versus Ö,ö for Danish, Norwegian
1025+ Ö,ö versus Ø,ø,Ő,ő,Õ,õ for Finnish SFS 4600 */
10231026 if (o_o_mode == 0 ) {
10241027 strgth = ucol_getStrength (icu_collator );
10251028 ucol_setStrength (icu_collator , UCOL_PRIMARY );
10261029 strX [0 ] = 0x0D6 ; strX [1 ] = 0x00 ; /* Ö */
1027- strY [0 ] = 0x150 ; strY [1 ] = 0x00 ; /* Ő */
1030+ strY [0 ] = 0x0D8 ; strY [1 ] = 0x00 ; /* Ø */
10281031 strZ [0 ] = 0x04F ; strZ [1 ] = 0x00 ; /* O */
1029- order = ucol_strcoll (icu_collator , strY , -1 , strX , -1 );
1030- order1 = ucol_strcoll (icu_collator , strZ , -1 , strX , -1 );
1031- o_o_mode = (order == UCOL_EQUAL && order1 != UCOL_EQUAL ) ? 2 : 1 ;
1032+ order = ucol_strcoll (icu_collator , strZ , -1 , strX , -1 );
1033+ order1 = ucol_strcoll (icu_collator , strZ , -1 , strY , -1 );
1034+ if (order == UCOL_LESS || order1 == UCOL_LESS ) {
1035+ o_o_mode = 2 ;
1036+ } else {
1037+ o_o_mode = 1 ;
1038+ }
10321039 ucol_setStrength (icu_collator , strgth );
10331040 }
10341041 if (o_o_mode == 2 ) {
1042+ strgth = ucol_getStrength (icu_collator );
1043+ ucol_setStrength (icu_collator , UCOL_SECONDARY );
1044+ strX [0 ] = 0x0D6 ; strX [1 ] = 0x00 ; /* Ö */
1045+ strY [0 ] = 0x0D8 ; strY [1 ] = 0x00 ; /* Ø */
1046+ strZ [0 ] = 0x150 ; strZ [1 ] = 0x00 ; /* Ő */
1047+ strW [0 ] = 0x0D5 ; strZ [1 ] = 0x00 ; /* Õ */
1048+ order2 = ucol_strcoll (icu_collator , strY , -1 , strZ , -1 );
1049+ order3 = ucol_strcoll (icu_collator , strZ , -1 , strX , -1 );
1050+ order4 = ucol_strcoll (icu_collator , strY , -1 , strX , -1 );
1051+ order5 = ucol_strcoll (icu_collator , strW , -1 , strX , -1 );
1052+ if (order1 == UCOL_LESS && order4 == UCOL_LESS ) {
1053+ o_o_mode = 3 ; /* O < Ø << Ö */
1054+ if (order2 == UCOL_LESS )
1055+ o_o_mode = 4 ; /* O < Ø << Ö and O < Ø << Ő */
1056+ } else if (order == UCOL_LESS && order4 == UCOL_GREATER ) {
1057+ o_o_mode = 6 ; /* O < Ö << Ø */
1058+ if (order3 == UCOL_GREATER )
1059+ o_o_mode = 7 ; /* O < Ö << Ø and O < Ö << Ő */
1060+ if (order3 == UCOL_GREATER && order5 == UCOL_GREATER )
1061+ o_o_mode = 8 ; /* O < Ö << Ø and O < Ö << Ő and O < Ö << Õ */
1062+ } else if (order == UCOL_LESS && order3 == UCOL_GREATER ) {
1063+ o_o_mode = 5 ; /* O < Ö << Ő */
1064+ }
1065+ ucol_setStrength (icu_collator , strgth );
1066+ }
1067+ if ((o_o_mode == 3 && (ch == 0x0D6 || ch == 0x0F6 )) || /* Ö */
1068+ (o_o_mode == 4 && (ch == 0x150 || ch == 0x151 || ch == 0x0D6 || ch == 0x0F6 )) || /* Ö,Ő */
1069+ (o_o_mode >=3 && o_o_mode <=4 && (ch == 0x0D8 || ch == 0x0F8 ))) { /* Ø */
1070+ ini [0 ] = 0x0D8 ; /* Ø */
1071+ return ;
1072+ }
1073+ if ((o_o_mode == 5 && (ch == 0x150 || ch == 0x151 )) || /* Ő */
1074+ (o_o_mode == 6 && (ch == 0x0D8 || ch == 0x0F8 )) || /* Ø */
1075+ (o_o_mode == 7 && (ch == 0x150 || ch == 0x151 || ch == 0x0D8 || ch == 0x0F8 )) || /* Ő,Ø */
1076+ (o_o_mode == 8 && (ch == 0x150 || ch == 0x151 ||
1077+ ch == 0x0D8 || ch == 0x0F8 || ch == 0x0D5 || ch == 0x0F5 )) || /* Ő,Ø,Õ */
1078+ (o_o_mode >=5 && o_o_mode <=8 && (ch == 0x0D6 || ch == 0x0F6 ))) { /* Ö */
10351079 ini [0 ] = 0x0D6 ; /* Ö */
10361080 return ;
10371081 }
10381082 }
10391083 if (ch == 0x0DC || ch == 0x0FC || ch == 0x170 || ch == 0x171 ) {
1040- /* check Ü,ü versus Ű,ű for Hungarian */
1084+ /* check Ü,ü versus Ű,ű for Hungarian, and for Finnish SFS 4600 */
10411085 if (u_u_mode == 0 ) {
10421086 strgth = ucol_getStrength (icu_collator );
10431087 ucol_setStrength (icu_collator , UCOL_PRIMARY );
@@ -1046,22 +1090,33 @@ static void index_normalize(UChar *istr, UChar *ini, int *chset)
10461090 strZ [0 ] = 0x055 ; strZ [1 ] = 0x00 ; /* U */
10471091 order = ucol_strcoll (icu_collator , strY , -1 , strX , -1 );
10481092 order1 = ucol_strcoll (icu_collator , strZ , -1 , strX , -1 );
1049- u_u_mode = (order == UCOL_EQUAL && order1 != UCOL_EQUAL ) ? 2 : 1 ;
1093+ if (order == UCOL_EQUAL && order1 != UCOL_EQUAL ) {
1094+ strZ [0 ] = 0x059 ; /* Y */
1095+ order1 = ucol_strcoll (icu_collator , strZ , -1 , strX , -1 );
1096+ u_u_mode = (order1 == UCOL_EQUAL ) ? 3 : 2 ;
1097+ } else {
1098+ u_u_mode = 1 ;
1099+ }
10501100 ucol_setStrength (icu_collator , strgth );
10511101 }
10521102 if (u_u_mode == 2 ) {
10531103 ini [0 ] = 0x0DC ; /* Ü */
10541104 return ;
1105+ } else if (o_o_mode == 3 ) {
1106+ ini [0 ] = 0x059 ; /* Y */
1107+ return ;
10551108 }
10561109 }
10571110 if (ch == 0x0C6 || ch == 0x0E6 || ch == 0x152 || ch == 0x153 || ch == 0x132 || ch == 0x133
10581111 || ch == 0x0DF || ch == 0x1E9E || ch == 0x13F || ch == 0x140 || ch == 0x149 || ch == 0x490 || ch == 0x491 ) {
10591112 strX [0 ] = u_toupper (ch ); strX [1 ] = 0x00 ; /* ex. "Æ" "Œ" */
10601113 switch (ch ) {
10611114 case 0x0C6 : case 0x0E6 : /* Æ æ */
1062- strZ [0 ] = 0x41 ; break ; /* A */
1115+ strZ [0 ] = 0x41 ; /* A */
1116+ strW [0 ] = 0xC4 ; break ; /* Ä */
10631117 case 0x152 : case 0x153 : /* Œ œ */
1064- strZ [0 ] = 0x4F ; break ; /* O */
1118+ strZ [0 ] = 0x4F ; /* O */
1119+ strW [0 ] = 0xD6 ; break ; /* Ö */
10651120 case 0x0DF : case 0x1E9E : /* ß ẞ */
10661121 strZ [0 ] = 0x53 ; break ; /* S */
10671122 case 0x132 : case 0x133 : /* IJ ij */
@@ -1080,6 +1135,21 @@ static void index_normalize(UChar *istr, UChar *ini, int *chset)
10801135 strZ [2 ] = 0x00 ; /* ex. "AZ" "OZ" "ГЯ" */
10811136 order = ucol_strcoll (icu_collator , strZ , -1 , strX , -1 );
10821137 if (order == UCOL_GREATER ) { ini [0 ]= strZ [0 ]; return ; } /* not ligature */
1138+
1139+ if (ch == 0x0C6 || ch == 0x0E6 || ch == 0x152 || ch == 0x153 ) {
1140+ /* check Æ,Œ versus Ä,Ö for Finnish */
1141+ strW [1 ] = 0x00 ;
1142+ strgth = ucol_getStrength (icu_collator );
1143+ ucol_setStrength (icu_collator , UCOL_PRIMARY );
1144+ order = ucol_strcoll (icu_collator , strW , -1 , strX , -1 );
1145+ ucol_setStrength (icu_collator , UCOL_SECONDARY );
1146+ order1 = ucol_strcoll (icu_collator , strW , -1 , strX , -1 );
1147+ strgth = ucol_getStrength (icu_collator );
1148+ if (order == UCOL_EQUAL ) {
1149+ ini [0 ] = (order1 == UCOL_GREATER ) ? strX [0 ] : strW [0 ];
1150+ return ;
1151+ }
1152+ }
10831153 }
10841154 else if ((is_latin (& ch )&& ch > 0x7F )||
10851155 (is_cyrillic (& ch )&& (ch < 0x410 || ch == 0x419 || ch == 0x439 || ch > 0x44F ))||
@@ -1094,7 +1164,20 @@ static void index_normalize(UChar *istr, UChar *ini, int *chset)
10941164 strZ [0 ] = u_toupper (dest [0 ]); strZ [2 ] = 0x00 ; /* ex. "AZ" */
10951165 strX [0 ] = u_toupper (ch ); strX [1 ] = 0x00 ; /* ex. "Å" */
10961166 order = ucol_strcoll (icu_collator , strZ , -1 , strX , -1 );
1097- if (order == UCOL_LESS ) { ini [0 ]= strX [0 ]; return ; } /* with diacritic */
1167+ if (order == UCOL_LESS ) { /* with diacritic */
1168+ if (strX [0 ]!= 0xC4 ) { /* Ä */
1169+ ini [0 ]= strX [0 ]; return ;
1170+ }
1171+ strZ [0 ] = 0x0C6 ; strZ [1 ] = 0x00 ; /* Æ */
1172+ strgth = ucol_getStrength (icu_collator );
1173+ ucol_setStrength (icu_collator , UCOL_PRIMARY );
1174+ order = ucol_strcoll (icu_collator , strZ , -1 , strX , -1 );
1175+ ucol_setStrength (icu_collator , UCOL_SECONDARY );
1176+ order1 = ucol_strcoll (icu_collator , strZ , -1 , strX , -1 );
1177+ strgth = ucol_getStrength (icu_collator );
1178+ ini [0 ] = (order == UCOL_EQUAL && order1 == UCOL_LESS ) ? strZ [0 ] : strX [0 ];
1179+ return ;
1180+ }
10981181 ch = dest [0 ]; /* without diacritic */
10991182 }
11001183 }
@@ -1151,6 +1234,25 @@ static void index_normalize(UChar *istr, UChar *ini, int *chset)
11511234 return ;
11521235 }
11531236 }
1237+ /* AA for Norwegian, Danish */
1238+ if (strX [0 ]== 0x41 && strX [1 ]== 0x41 ) { /* AA */
1239+ strX [2 ]= L'\0' ;
1240+ strY [0 ]= 0xC5 ; strY [1 ]= L'\0' ; /* Å */
1241+ strZ [0 ]= 0x41 ; strZ [1 ]= 0x42 ; strZ [3 ]= L'\0' ; /* AB */
1242+ order = ucol_strcoll (icu_collator , strZ , -1 , strX , -1 );
1243+ ucol_setStrength (icu_collator , UCOL_PRIMARY );
1244+ order1 = ucol_strcoll (icu_collator , strY , -1 , strX , -1 );
1245+ strgth = ucol_getStrength (icu_collator );
1246+ if (order == UCOL_LESS ) {
1247+ if (order1 == UCOL_EQUAL ) {
1248+ ini [0 ]= strY [0 ]; ini [1 ]= L'\0' ; /* Å */
1249+ } else {
1250+ ini [0 ]= strX [0 ]; ini [1 ]= strX [1 ]; /* AA */
1251+ ini [2 ]= L'\0' ;
1252+ }
1253+ return ;
1254+ }
1255+ }
11541256 /* other digraphs */
11551257 if (((strX [0 ]== 0x43 || strX [0 ]== 0x44 || strX [0 ]== 0x50 || strX [0 ]== 0x52 || strX [0 ]== 0x53 || strX [0 ]== 0x54 ||
11561258 strX [0 ]== 0x58 || strX [0 ]== 0x5A ) && strX [1 ]== 0x48 ) || /* CH DH PH RH SH TH XH ZH */
0 commit comments