Subject Re: Creating GB18030 character set and collation
Author peter_jacobi.rm
This short utility, linked against libiconv or using the iconv
of a POSIX system, will dump the necessary Unicode => GB18030
mapping tables.

This is the same data that is in src/intl/charsets/gb2312
for GB2312. You can specify the actual charset wanted on
the command line, e.g. GBK or GB18030.

The inverse mapping table which goes into the same header
is missing, but should be easy to add.

All four byte GB18030 sequences are ignored and I don't
volunteer to go into a Chinese jail for this offense. Hope
you'll get through with this.

#include <assert.h>
#include <stdio.h>

#include "include/iconv.h"

/* Table geometry: the 16-bit code space is split into 256 pages of 256 units. */
enum {
    MAP_PAGE_LEN = 256,   /* code units per page (low byte of the code unit) */
    MAP_PAGE_COUNT = 256, /* number of pages (high byte of the code unit) */
    MAP_OUTBUF_LEN = 8,   /* scratch space for one converted character */
    MAP_MAX_TARGET = 2    /* longer target sequences are ignored */
};

/*
 * Convert the single UTF-16LE code unit `code` with descriptor `cv`.
 *
 * A trace line (code unit, first two output bytes, bytes consumed, bytes
 * produced) is written to stderr for every attempt.  The descriptor is put
 * back into its initial state afterwards so that a failed or stateful
 * conversion cannot influence the next code unit.
 *
 * Returns 1 and stores the result in *gb when the whole code unit was
 * consumed and at most MAP_MAX_TARGET bytes were produced; the first output
 * byte lands in the low 8 bits of *gb and the second in the high 8 bits.
 * Returns 0 (leaving *gb untouched) otherwise.
 */
static int convert_code_unit (iconv_t cv, unsigned code, unsigned short *gb)
{
    unsigned char inbuf [2];
    char outbuf [MAP_OUTBUF_LEN] = {0};
    char *pinbuf = (char *) inbuf;
    char *poutbuf = outbuf;
    size_t inbytesleft = sizeof inbuf;
    size_t outbytesleft = sizeof outbuf;
    size_t rc;

    /* Little-endian: low byte first. */
    inbuf [0] = (unsigned char) (code & 0xffu);
    inbuf [1] = (unsigned char) ((code >> 8) & 0xffu);

    /*
     * POSIX declares the input argument as `char **`, some libiconv headers
     * as `const char **`; going through `void *` is accepted by both.
     */
    rc = iconv (cv, (void *) &pinbuf, &inbytesleft, &poutbuf, &outbytesleft);

    /* Bytes are printed as unsigned so that values >= 0x80 stay two digits. */
    fprintf (stderr, "%4.4x %2.2x %2.2x %1d %1d\n",
             code,
             (unsigned) (unsigned char) outbuf [0],
             (unsigned) (unsigned char) outbuf [1],
             (int) (sizeof inbuf - inbytesleft),
             (int) (sizeof outbuf - outbytesleft));

    /* Return the descriptor to its initial conversion state. */
    (void) iconv (cv, NULL, NULL, NULL, NULL);

    if (rc == (size_t) -1 || inbytesleft != 0 ||
        outbytesleft < (size_t) (MAP_OUTBUF_LEN - MAP_MAX_TARGET)) {
        return 0;
    }

    *gb = (unsigned short) ((unsigned char) outbuf [0] +
                            256u * (unsigned char) outbuf [1]);
    return 1;
}

/*
 * Dump two C array initializer lists on stdout that map every 16-bit
 * UTF-16 code unit to the charset named by argv[1] (the iconv "to" code):
 *
 *   1. a value array made of one all-zero page followed by one 256-entry
 *      page for each high byte that has at least one mapped code unit;
 *   2. a 256-entry index giving, for each high byte, the offset of its page
 *      in the value array (0 selects the leading all-zero page).
 *
 * Exit status: 0 on success, 1 on a usage error or when the conversion
 * descriptor cannot be opened.
 *
 * NOTE(review): the emitted section comments say "to_unicode", yet the
 * conversion runs from UTF-16LE to the named charset -- confirm which table
 * names the consumer of this output expects.
 */
int main (int argc, char *argv [])
{
    int offsetmap [MAP_PAGE_COUNT] = {0};
    int offset = MAP_PAGE_LEN; /* page 0 of the value array is the zero page */
    int i, j;
    iconv_t cv;

    if (argc != 2) {
        fprintf (stderr, "usage: %s <charset>\n",
                 (argc > 0 && argv [0] != NULL) ? argv [0] : "gbdump");
        return 1;
    }

    cv = iconv_open (argv [1], "UTF-16LE");
    /* iconv_t is opaque, so report only success (0) or failure (-1). */
    fprintf (stderr, "'%s' %d\n", argv [1], cv == (iconv_t) -1 ? -1 : 0);
    if (cv == (iconv_t) -1) {
        return 1;
    }

    printf ("\n/* to_unicode_mapping_array */\n");
    for (j = 0; j < MAP_PAGE_LEN; ++j) {
        printf ("0,\n");
    }

    for (i = 0; i < MAP_PAGE_COUNT; ++i) {
        unsigned short gb [MAP_PAGE_LEN] = {0};
        int mapped = 0;

        for (j = 0; j < MAP_PAGE_LEN; ++j) {
            unsigned code = (unsigned) (MAP_PAGE_LEN * i + j);

            if (convert_code_unit (cv, code, &gb [j])) {
                ++mapped;
            }
        }

        /* Pages without any mapping share the leading zero page. */
        if (mapped == 0) {
            continue;
        }

        for (j = 0; j < MAP_PAGE_LEN; ++j) {
            if (gb [j] == 0) {
                printf ("0,\n");
            } else {
                printf ("0x%4.4x,\n", (unsigned) gb [j]);
            }
        }
        offsetmap [i] = offset;
        offset += MAP_PAGE_LEN;
    }

    printf ("\n/* to_unicode_map */\n");
    for (i = 0; i < MAP_PAGE_COUNT; ++i) {
        printf ("%d,\n", offsetmap [i]);
    }

    iconv_close (cv);
    return 0;
}