utf_util.c source code [OpenJDK/src/jdk.jdwp.agent/share/native/libjdwp/utf_util.c]

1	/*
2	* Copyright (c) 1998, 2016, Oracle and/or its affiliates. All rights reserved.
3	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4	*
5	* This code is free software; you can redistribute it and/or modify it
6	* under the terms of the GNU General Public License version 2 only, as
7	* published by the Free Software Foundation. Oracle designates this
8	* particular file as subject to the "Classpath" exception as provided
9	* by Oracle in the LICENSE file that accompanied this code.
10	*
11	* This code is distributed in the hope that it will be useful, but WITHOUT
12	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14	* version 2 for more details (a copy is included in the LICENSE file that
15	* accompanied this code).
16	*
17	* You should have received a copy of the GNU General Public License version
18	* 2 along with this work; if not, write to the Free Software Foundation,
19	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20	*
21	* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22	* or visit www.oracle.com if you need additional information or have any
23	* questions.
24	*/
25
26	#include <stdlib.h>
27	#include <ctype.h>
28
29	#include "jni.h"
30
31	#include "utf_util.h"
32
33
/* Error and assert macros */
#define UTF_ERROR(m) utfError(__FILE__, __LINE__, m)
#define UTF_ASSERT(x) ( (x)==0 ? UTF_ERROR("ASSERT ERROR " #x) : (void)0 )

// Platform independent part

/*
 * Report a fatal UTF conversion error and terminate the process.
 *
 *   file    - source file name of the failure site (normally __FILE__)
 *   line    - source line number of the failure site (normally __LINE__)
 *   message - text describing the failure
 *
 * Writes one diagnostic line to stderr and then calls abort(), so this
 * function never returns to its caller.
 */
static void utfError(const char *file, int line, const char *message) {
    (void)fprintf(stderr, "UTF ERROR [\"%s\":%d]: %s\n", file, line, message);
    abort();
}
44
45	/ Determine length of this Standard UTF-8 in Modified UTF-8.*
46	* Validation is done of the basic UTF encoding rules, returns
47	* length (no change) when errors are detected in the UTF encoding.
48	*
49	* Note: Accepts Modified UTF-8 also, no verification on the
50	* correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok.
51	*/
52	int JNICALL utf8sToUtf8mLength(jbyte string, int* length) {
53	int newLength;
54	int i;
55
56	newLength = `0`;
57	for ( i = `0` ; i < length ; i++ ) {
58	unsigned byte;
59
60	byte = (unsigned char)string[i];
61	if ( (byte & `0x80`) == `0` ) { / 1byte encoding /
62	newLength++;
63	if ( byte == `0` ) {
64	newLength++; / We gain one byte in length on NULL bytes /
65	}
66	} else if ( (byte & `0xE0`) == `0xC0` ) { / 2byte encoding /
67	/ Check encoding of following bytes /
68	if ( (i+`1`) >= length \|\| (string[i+`1`] & `0xC0`) != `0x80` ) {
69	break; / Error condition /
70	}
71	i++; / Skip next byte /
72	newLength += `2`;
73	} else if ( (byte & `0xF0`) == `0xE0` ) { / 3byte encoding /
74	/ Check encoding of following bytes /
75	if ( (i+`2`) >= length \|\| (string[i+`1`] & `0xC0`) != `0x80`
76	\|\| (string[i+`2`] & `0xC0`) != `0x80` ) {
77	break; / Error condition /
78	}
79	i += `2`; / Skip next two bytes /
80	newLength += `3`;
81	} else if ( (byte & `0xF8`) == `0xF0` ) { / 4byte encoding /
82	/ Check encoding of following bytes /
83	if ( (i+`3`) >= length \|\| (string[i+`1`] & `0xC0`) != `0x80`
84	\|\| (string[i+`2`] & `0xC0`) != `0x80`
85	\|\| (string[i+`3`] & `0xC0`) != `0x80` ) {
86	break; / Error condition /
87	}
88	i += `3`; / Skip next 3 bytes /
89	newLength += `6`; / 4byte encoding turns into 2 3byte ones /
90	} else {
91	break; / Error condition /
92	}
93	}
94	if ( i != length ) {
95	/ Error in finding new length, return old length so no conversion /
96	/ FIXUP: ERROR_MESSAGE? /
97	return length;
98	}
99	return newLength;
100	}
101
102	/ Convert Standard UTF-8 to Modified UTF-8.*
103	* Assumes the UTF-8 encoding was validated by utf8mLength() above.
104	*
105	* Note: Accepts Modified UTF-8 also, no verification on the
106	* correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok.
107	*/
108	void JNICALL utf8sToUtf8m(jbyte string, int* length, jbyte newString, int* newLength) {
109	int i;
110	int j;
111
112	j = `0`;
113	for ( i = `0` ; i < length ; i++ ) {
114	unsigned byte1;
115
116	byte1 = (unsigned char)string[i];
117
118	/ NULL bytes and bytes starting with 11110xxx are special /
119	if ( (byte1 & `0x80`) == `0` ) { / 1byte encoding /
120	if ( byte1 == `0` ) {
121	/ Bits out: 11000000 10000000 /
122	newString[j++] = (jbyte)`0xC0`;
123	newString[j++] = (jbyte)`0x80`;
124	} else {
125	/ Single byte /
126	newString[j++] = byte1;
127	}
128	} else if ( (byte1 & `0xE0`) == `0xC0` ) { / 2byte encoding /
129	newString[j++] = byte1;
130	newString[j++] = string[++i];
131	} else if ( (byte1 & `0xF0`) == `0xE0` ) { / 3byte encoding /
132	newString[j++] = byte1;
133	newString[j++] = string[++i];
134	newString[j++] = string[++i];
135	} else if ( (byte1 & `0xF8`) == `0xF0` ) { / 4byte encoding /
136	/ Beginning of 4byte encoding, turn into 2 3byte encodings /
137	unsigned byte2, byte3, byte4, u21;
138
139	/ Bits in: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx /
140	byte2 = (unsigned char)string[++i];
141	byte3 = (unsigned char)string[++i];
142	byte4 = (unsigned char)string[++i];
143	/ Reconstruct full 21bit value /
144	u21 = (byte1 & `0x07`) << `18`;
145	u21 += (byte2 & `0x3F`) << `12`;
146	u21 += (byte3 & `0x3F`) << `6`;
147	u21 += (byte4 & `0x3F`);
148	/ Bits out: 11101101 1010xxxx 10xxxxxx /
149	newString[j++] = (jbyte)`0xED`;
150	newString[j++] = (jbyte)(`0xA0` + (((u21 >> `16`) - `1`) & `0x0F`));
151	newString[j++] = (jbyte)(`0x80` + ((u21 >> `10`) & `0x3F`));
152	/ Bits out: 11101101 1011xxxx 10xxxxxx /
153	newString[j++] = (jbyte)`0xED`;
154	newString[j++] = (jbyte)(`0xB0` + ((u21 >> `6`) & `0x0F`));
155	newString[j++] = byte4;
156	}
157	}
158	UTF_ASSERT(i==length);
159	UTF_ASSERT(j==newLength);
160	newString[j] = (jbyte)`0`;
161	}
162
163	/ Given a Modified UTF-8 string, calculate the Standard UTF-8 length.*
164	* Basic validation of the UTF encoding rules is done, and length is
165	* returned (no change) when errors are detected.
166	*
167	* Note: No validation is made that this is indeed Modified UTF-8 coming in.
168	*
169	*/
170	int JNICALL utf8mToUtf8sLength(jbyte string, int* length) {
171	int newLength;
172	int i;
173
174	newLength = `0`;
175	for ( i = `0` ; i < length ; i++ ) {
176	unsigned byte1, byte2, byte3, byte4, byte5, byte6;
177
178	byte1 = (unsigned char)string[i];
179	if ( (byte1 & `0x80`) == `0` ) { / 1byte encoding /
180	newLength++;
181	} else if ( (byte1 & `0xE0`) == `0xC0` ) { / 2byte encoding /
182	/ Check encoding of following bytes /
183	if ( (i+`1`) >= length \|\| (string[i+`1`] & `0xC0`) != `0x80` ) {
184	break; / Error condition /
185	}
186	byte2 = (unsigned char)string[++i];
187	if ( byte1 != `0xC0` \|\| byte2 != `0x80` ) {
188	newLength += `2`; / Normal 2byte encoding, not 0xC080 /
189	} else {
190	newLength++; / We will turn 0xC080 into 0 /
191	}
192	} else if ( (byte1 & `0xF0`) == `0xE0` ) { / 3byte encoding /
193	/ Check encoding of following bytes /
194	if ( (i+`2`) >= length \|\| (string[i+`1`] & `0xC0`) != `0x80`
195	\|\| (string[i+`2`] & `0xC0`) != `0x80` ) {
196	break; / Error condition /
197	}
198	byte2 = (unsigned char)string[++i];
199	byte3 = (unsigned char)string[++i];
200	newLength += `3`;
201	/ Possible process a second 3byte encoding /
202	if ( (i+`3`) < length && byte1 == `0xED` && (byte2 & `0xF0`) == `0xA0` ) {
203	/ See if this is a pair of 3byte encodings /
204	byte4 = (unsigned char)string[i+`1`];
205	byte5 = (unsigned char)string[i+`2`];
206	byte6 = (unsigned char)string[i+`3`];
207	if ( byte4 == `0xED` && (byte5 & `0xF0`) == `0xB0` ) {
208	/ Check encoding of 3rd byte /
209	if ( (byte6 & `0xC0`) != `0x80` ) {
210	break; / Error condition /
211	}
212	newLength++; / New string will have 4byte encoding /
213	i += `3`; / Skip next 3 bytes /
214	}
215	}
216	} else {
217	break; / Error condition /
218	}
219	}
220	if ( i != length ) {
221	/ Error in UTF encoding /
222	/ FIXUP: ERROR_MESSAGE()? /
223	return length;
224	}
225	return newLength;
226	}
227
228	/ Convert a Modified UTF-8 string into a Standard UTF-8 string*
229	* It is assumed that this string has been validated in terms of the
230	* basic UTF encoding rules by utf8Length() above.
231	*
232	* Note: No validation is made that this is indeed Modified UTF-8 coming in.
233	*
234	*/
235	void JNICALL utf8mToUtf8s(jbyte string, int* length, jbyte newString, int* newLength) {
236	int i;
237	int j;
238
239	j = `0`;
240	for ( i = `0` ; i < length ; i++ ) {
241	unsigned byte1, byte2, byte3, byte4, byte5, byte6;
242
243	byte1 = (unsigned char)string[i];
244	if ( (byte1 & `0x80`) == `0` ) { / 1byte encoding /
245	/ Single byte /
246	newString[j++] = byte1;
247	} else if ( (byte1 & `0xE0`) == `0xC0` ) { / 2byte encoding /
248	byte2 = (unsigned char)string[++i];
249	if ( byte1 != `0xC0` \|\| byte2 != `0x80` ) {
250	newString[j++] = byte1;
251	newString[j++] = byte2;
252	} else {
253	newString[j++] = `0`;
254	}
255	} else if ( (byte1 & `0xF0`) == `0xE0` ) { / 3byte encoding /
256	byte2 = (unsigned char)string[++i];
257	byte3 = (unsigned char)string[++i];
258	if ( i+`3` < length && byte1 == `0xED` && (byte2 & `0xF0`) == `0xA0` ) {
259	/ See if this is a pair of 3byte encodings /
260	byte4 = (unsigned char)string[i+`1`];
261	byte5 = (unsigned char)string[i+`2`];
262	byte6 = (unsigned char)string[i+`3`];
263	if ( byte4 == `0xED` && (byte5 & `0xF0`) == `0xB0` ) {
264	unsigned u21;
265
266	/ Bits in: 11101101 1010xxxx 10xxxxxx /
267	/ Bits in: 11101101 1011xxxx 10xxxxxx /
268	i += `3`;
269
270	/ Reconstruct 21 bit code /
271	u21 = ((byte2 & `0x0F`) + `1`) << `16`;
272	u21 += (byte3 & `0x3F`) << `10`;
273	u21 += (byte5 & `0x0F`) << `6`;
274	u21 += (byte6 & `0x3F`);
275
276	/ Bits out: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx /
277
278	/ Convert to 4byte encoding /
279	newString[j++] = `0xF0` + ((u21 >> `18`) & `0x07`);
280	newString[j++] = `0x80` + ((u21 >> `12`) & `0x3F`);
281	newString[j++] = `0x80` + ((u21 >> `6`) & `0x3F`);
282	newString[j++] = `0x80` + (u21 & `0x3F`);
283	continue;
284	}
285	}
286	/ Normal 3byte encoding /
287	newString[j++] = byte1;
288	newString[j++] = byte2;
289	newString[j++] = byte3;
290	}
291	}
292	UTF_ASSERT(i==length);
293	UTF_ASSERT(j==newLength);
294	newString[j] = `0`;
295	}
296
297	#ifdef _WIN32
298	// Microsoft Windows specific part
299
300	#include <windows.h>
301
302	static UINT getCodepage() {
303	LANGID langID;
304	LCID localeID;
305	TCHAR strCodePage[`7`]; // ANSI code page id
306
307	static UINT intCodePage = -`1`;
308
309	if (intCodePage == -`1`) {
310	// Firts call, get codepage from the os
311	langID = LANGIDFROMLCID(GetUserDefaultLCID());
312	localeID = MAKELCID(langID, SORT_DEFAULT);
313	if (GetLocaleInfo(localeID, LOCALE_IDEFAULTANSICODEPAGE,
314	strCodePage, sizeof(strCodePage)/sizeof(TCHAR)) > `0` ) {
315	intCodePage = atoi(strCodePage);
316	}
317	else {
318	intCodePage = GetACP();
319	}
320	}
321
322	return intCodePage;
323	}
324
325	/*
326	* Get wide string (assumes len>0)
327	*/
328	static WCHAR* getWideString(UINT codePage, char* str, int len, int *pwlen) {
329	int wlen;
330	WCHAR* wstr;
331
332	/ Convert the string to WIDE string /
333	wlen = MultiByteToWideChar(codePage, `0`, str, len, NULL, `0`);
334	*pwlen = wlen;
335	if (wlen <= `0`) {
336	UTF_ERROR(("Can't get WIDE string length"));
337	return NULL;
338	}
339	wstr = (WCHAR)malloc(wlen sizeof(WCHAR));
340	if (wstr == NULL) {
341	UTF_ERROR(("Can't malloc() any space"));
342	return NULL;
343	}
344	if (MultiByteToWideChar(codePage, `0`, str, len, wstr, wlen) == `0`) {
345	UTF_ERROR(("Can't get WIDE string"));
346	return NULL;
347	}
348	return wstr;
349	}
350
351	/*
352	* Convert UTF-8 to a platform string
353	* NOTE: outputBufSize includes the space for the trailing 0.
354	*/
355	int JNICALL utf8ToPlatform(jbyte utf8, int* len, char* output, int outputBufSize) {
356	int wlen;
357	int plen;
358	WCHAR* wstr;
359	UINT codepage;
360	int outputMaxLen;
361
362	UTF_ASSERT(utf8);
363	UTF_ASSERT(output);
364	UTF_ASSERT(len >= `0`);
365	UTF_ASSERT(outputBufSize > len);
366	outputMaxLen = outputBufSize - `1`; // leave space for trailing 0
367
368	/ Zero length is ok, but we don't need to do much /
369	if ( len == `0` ) {
370	output[`0`] = `0`;
371	return `0`;
372	}
373
374	/ Get WIDE string version (assumes len>0) /
375	wstr = getWideString(CP_UTF8, (char*)utf8, len, &wlen);
376	if ( wstr == NULL ) {
377	// Can't allocate WIDE string
378	goto just_copy_bytes;
379	}
380
381	/ Convert WIDE string to MultiByte string /
382	codepage = getCodepage();
383	plen = WideCharToMultiByte(codepage, `0`, wstr, wlen,
384	output, outputMaxLen, NULL, NULL);
385	free(wstr);
386	if (plen <= `0`) {
387	// Can't convert WIDE string to multi-byte
388	goto just_copy_bytes;
389	}
390	output[plen] = `'\0'`;
391	return plen;
392
393	just_copy_bytes:
394	(void)memcpy(output, utf8, len);
395	output[len] = `0`;
396	return len;
397	}
398
399	/*
400	* Convert Platform Encoding to UTF-8.
401	* NOTE: outputBufSize includes the space for the trailing 0.
402	*/
403	int JNICALL utf8FromPlatform(char str, int* len, jbyte output, int* outputBufSize) {
404	int wlen;
405	int plen;
406	WCHAR* wstr;
407	UINT codepage;
408	int outputMaxLen;
409
410	UTF_ASSERT(str);
411	UTF_ASSERT(output);
412	UTF_ASSERT(len >= `0`);
413	UTF_ASSERT(outputBufSize > len);
414	outputMaxLen = outputBufSize - `1`; // leave space for trailing 0
415
416	/ Zero length is ok, but we don't need to do much /
417	if ( len == `0` ) {
418	output[`0`] = `0`;
419	return `0`;
420	}
421
422	/ Get WIDE string version (assumes len>0) /
423	codepage = getCodepage();
424	wstr = getWideString(codepage, str, len, &wlen);
425	if ( wstr == NULL ) {
426	goto just_copy_bytes;
427	}
428
429	/ Convert WIDE string to UTF-8 string /
430	plen = WideCharToMultiByte(CP_UTF8, `0`, wstr, wlen,
431	(char*)output, outputMaxLen, NULL, NULL);
432	free(wstr);
433	if (plen <= `0`) {
434	UTF_ERROR(("Can't convert WIDE string to multi-byte"));
435	goto just_copy_bytes;
436	}
437	output[plen] = `'\0'`;
438	return plen;
439
440	just_copy_bytes:
441	(void)memcpy(output, str, len);
442	output[len] = `0`;
443	return len;
444	}
445
446
447	#else
// *NIX specific part
449
450	#include <iconv.h>
451	#include <locale.h>
452	#include <langinfo.h>
453	#include <string.h>
454
typedef enum {TO_UTF8, FROM_UTF8} conv_direction;

/*
 * Do iconv() conversion.
 * NOTE: outputBufSize includes the space for the trailing 0.
 *
 *   drn           - TO_UTF8:   bytes is in the locale codeset, output is UTF-8
 *                   FROM_UTF8: bytes is UTF-8, output is in the locale codeset
 *   bytes         - input bytes
 *   len           - number of bytes in bytes
 *   output        - destination buffer
 *   outputBufSize - size of output in bytes; must be greater than len
 *
 * Returns the number of bytes stored in output, not counting the
 * trailing 0. On any failure (locale cannot be initialized, codeset pair
 * not supported, invalid or incomplete input, output too small) the input
 * bytes are copied to output unchanged and len is returned; -1 is never
 * returned.
 *
 * The first call runs setlocale(LC_ALL, "") and remembers the codeset in
 * a function-local static; (char *)-1 in that static marks "conversion
 * not available, always copy bytes".
 */
static int iconvConvert(conv_direction drn, char *bytes, size_t len, char *output, size_t outputBufSize) {

    static char *codeset = 0;
    iconv_t func;
    size_t bytes_converted;
    size_t inLeft, outLeft;
    char *inbuf, *outbuf;
    int outputMaxLen;

    UTF_ASSERT(bytes);
    UTF_ASSERT(output);
    UTF_ASSERT(outputBufSize > len);
    outputMaxLen = outputBufSize - 1; // leave space for trailing 0

    /* Zero length is ok, but we don't need to do much */
    if ( len == 0 ) {
        output[0] = 0;
        return 0;
    }

    if (codeset == NULL) {
        // locale is not initialized, do it now
        if (setlocale(LC_ALL, "") != NULL) {
            // nl_langinfo returns ANSI_X3.4-1968 by default
            codeset = (char*)nl_langinfo(CODESET);
        }

        if (codeset == NULL) {
            // Not able to initialize process locale from platform one.
            codeset = (char *) -1;
        }
    }

    if (codeset == (char *) -1) {
        // There was an error during initialization, so just bail out
        goto just_copy_bytes;
    }

    // iconv_open() takes (tocode, fromcode): the target encoding comes first
    func = (drn == TO_UTF8) ? iconv_open("UTF-8", codeset) : iconv_open(codeset, "UTF-8");
    if (func == (iconv_t) -1) {
        // Requested charset combination is not supported, conversion couldn't be done.
        // make sure we will not try it again
        codeset = (char *) -1;
        goto just_copy_bytes;
    }

    // perform conversion
    inbuf = bytes;
    outbuf = output;
    inLeft = len;
    outLeft = outputMaxLen;

    // The (void*) cast lets the call compile whether iconv() declares its
    // input argument as char ** or const char **
    bytes_converted = iconv(func, (void*)&inbuf, &inLeft, &outbuf, &outLeft);
    iconv_close(func);

    // iconv() returns (size_t)-1 on error; otherwise it returns the number
    // of non-reversible conversions, so 0 is the normal success value.
    if (bytes_converted == (size_t) -1 || inLeft != 0) {
        // Input string is invalid, not able to convert entire string
        // or some other iconv error happens.
        goto just_copy_bytes;
    }

    // Overwrite bytes_converted with value of actually stored bytes
    bytes_converted = outputMaxLen-outLeft;
    output[bytes_converted] = 0;
    return (int)bytes_converted;


just_copy_bytes:
    /* Safe because outputBufSize > len was asserted above */
    (void)memcpy(output, bytes, len);
    output[len] = 0;
    return (int)len;
}
534
535	/*
536	* Convert UTF-8 to Platform Encoding.
537	* Returns length or -1 if output overflows.
538	* NOTE: outputBufSize includes the space for the trailing 0.
539	*/
540	int JNICALL utf8ToPlatform(jbyte utf8, int* len, char output, int* outputBufSize) {
541	return iconvConvert(FROM_UTF8, (char*)utf8, len, output, outputBufSize);
542	}
543
544	/*
545	* Convert Platform Encoding to UTF-8.
546	* Returns length or -1 if output overflows.
547	* NOTE: outputBufSize includes the space for the trailing 0.
548	*/
549	int JNICALL utf8FromPlatform(char str, int* len, jbyte output, int* outputBufSize) {
550	return iconvConvert(TO_UTF8, str, len, (char*) output, outputBufSize);
551	}
552
553	#endif
554

Browse the source code of OpenJDK/src/jdk.jdwp.agent/share/native/libjdwp/utf_util.c