Damn!!! Symbian had the correct BPE script; since Microsoft took over, they removed the Symbian site that hosted the script, which is very unfortunate.
Could someone compile this?
// -*- mode:c++; tab-width:2; indent-tabs-mode:nil; c-basic-offset:2 -*-
2
3 /*
4 * Copyright (C) 2010-2011 ZXing authors
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 *      http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19 #include <zxing/common/StringUtils.h>
20 #include <zxing/DecodeHints.h>
21
22 using namespace std;
23 using namespace zxing;
24 using namespace zxing::common;
25
26 // N.B.: these are the iconv strings for at least some versions of iconv
27
28 char const* const StringUtils:
LATFORM_DEFAULT_ENCODING = "UTF-8";
29 char const* const StringUtils::ASCII = "ASCII";
30 char const* const StringUtils::SHIFT_JIS = "SHIFT_JIS";
31 char const* const StringUtils::GB2312 = "GBK";
32 char const* const StringUtils::EUC_JP = "EUC-JP";
33 char const* const StringUtils::UTF8 = "UTF-8";
34 char const* const StringUtils::ISO88591 = "ISO8859-1";
35 const bool StringUtils::ASSUME_SHIFT_JIS = false;
36
37 string
38 StringUtils::guessEncoding(unsigned char* bytes, int length, Hashtable const& hints) {
39 Hashtable::const_iterator i = hints.find(DecodeHints::CHARACTER_SET);
40 if (i != hints.end()) {
41 return i->second;
42 }
43 // Does it start with the UTF-8 byte order mark? then guess it's UTF-8
44 if (length > 3 &&
45 bytes[0] == (unsigned char) 0xEF &&
46 bytes[1] == (unsigned char) 0xBB &&
47 bytes[2] == (unsigned char) 0xBF) {
48 return UTF8;
49 }
50 // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
51 // which should be by far the most common encodings. ISO-8859-1
52 // should not have bytes in the 0x80 - 0x9F range, while Shift_JIS
53 // uses this as a first byte of a two-byte character. If we see this
54 // followed by a valid second byte in Shift_JIS, assume it is Shift_JIS.
55 // If we see something else in that second byte, we'll make the risky guess
56 // that it's UTF-8.
57 bool canBeISO88591 = true;
58 bool canBeShiftJIS = true;
59 bool canBeUTF8 = true;
60 int utf8BytesLeft = 0;
61 int maybeDoubleByteCount = 0;
62 int maybeSingleByteKatakanaCount = 0;
63 bool sawLatin1Supplement = false;
64 bool sawUTF8Start = false;
65 bool lastWasPossibleDoubleByteStart = false;
66
67 for (int i = 0;
68 i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8);
69 i++) {
70
71 int value = bytes
& 0xFF;
72
73 // UTF-8 stuff
74 if (value >= 0x80 && value <= 0xBF) {
75 if (utf8BytesLeft > 0) {
76 utf8BytesLeft--;
77 }
78 } else {
79 if (utf8BytesLeft > 0) {
80 canBeUTF8 = false;
81 }
82 if (value >= 0xC0 && value <= 0xFD) {
83 sawUTF8Start = true;
84 int valueCopy = value;
85 while ((valueCopy & 0x40) != 0) {
86 utf8BytesLeft++;
87 valueCopy <<= 1;
88 }
89 }
90 }
91
92 // ISO-8859-1 stuff
93
94 if ((value == 0xC2 || value == 0xC3) && i < length - 1) {
95 // This is really a poor hack. The slightly more exotic characters people might want to put in
96 // a QR Code, by which I mean the Latin-1 supplement characters (e.g. u-umlaut) have encodings
97 // that start with 0xC2 followed by [0xA0,0xBF], or start with 0xC3 followed by [0x80,0xBF].
98 int nextValue = bytes[i + 1] & 0xFF;
99 if (nextValue <= 0xBF &&
100 ((value == 0xC2 && nextValue >= 0xA0) || (value == 0xC3 && nextValue >= 0x80))) {
101 sawLatin1Supplement = true;
102 }
103 }
104 if (value >= 0x7F && value <= 0x9F) {
105 canBeISO88591 = false;
106 }
107
108 // Shift_JIS stuff
109
110 if (value >= 0xA1 && value <= 0xDF) {
111 // count the number of characters that might be a Shift_JIS single-byte Katakana character
112 if (!lastWasPossibleDoubleByteStart) {
113 maybeSingleByteKatakanaCount++;
114 }
115 }
116 if (!lastWasPossibleDoubleByteStart &&
117 ((value >= 0xF0 && value <= 0xFF) || value == 0x80 || value == 0xA0)) {
118 canBeShiftJIS = false;
119 }
120 if ((value >= 0x81 && value <= 0x9F) || (value >= 0xE0 && value <= 0xEF)) {
121 // These start double-byte characters in Shift_JIS. Let's see if it's followed by a valid
122 // second byte.
123 if (lastWasPossibleDoubleByteStart) {
124 // If we just checked this and the last byte for being a valid double-byte
125 // char, don't check starting on this byte. If this and the last byte
126 // formed a valid pair, then this shouldn't be checked to see if it starts
127 // a double byte pair of course.
128 lastWasPossibleDoubleByteStart = false;
129 } else {
130 // ... otherwise do check to see if this plus the next byte form a valid
131 // double byte pair encoding a character.
132 lastWasPossibleDoubleByteStart = true;
133 if (i >= length - 1) {
134 canBeShiftJIS = false;
135 } else {
136 int nextValue = bytes[i + 1] & 0xFF;
137 if (nextValue < 0x40 || nextValue > 0xFC) {
138 canBeShiftJIS = false;
139 } else {
140 maybeDoubleByteCount++;
141 }
142 // There is some conflicting information out there about which bytes can follow which in
143 // double-byte Shift_JIS characters. The rule above seems to be the one that matches practice.
144 }
145 }
146 } else {
147 lastWasPossibleDoubleByteStart = false;
148 }
149 }
150 if (utf8BytesLeft > 0) {
151 canBeUTF8 = false;
152 }
153
154 // Easy -- if assuming Shift_JIS and no evidence it can't be, done
155 if (canBeShiftJIS && ASSUME_SHIFT_JIS) {
156 return SHIFT_JIS;
157 }
158 if (canBeUTF8 && sawUTF8Start) {
159 return UTF8;
160 }
161 // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough. The crude heuristic is:
162 // - If we saw
163 // - at least 3 bytes that starts a double-byte value (bytes that are rare in ISO-8859-1), or
164 // - over 5% of bytes could be single-byte Katakana (also rare in ISO-8859-1),
165 // - and, saw no sequences that are invalid in Shift_JIS, then we conclude Shift_JIS
166 if (canBeShiftJIS && (maybeDoubleByteCount >= 3 || 20 * maybeSingleByteKatakanaCount > length)) {
167 return SHIFT_JIS;
168 }
169 // Otherwise, we default to ISO-8859-1 unless we know it can't be
170 if (!sawLatin1Supplement && canBeISO88591) {
171 return ISO88591;
172 }
173 // Otherwise, we take a wild guess with platform encoding
174 return PLATFORM_DEFAULT_ENCODING;
175 }