2016-11-18 12 views
3

Различное поведение алгоритма при работе с UTF-8 в разных операционных системах. Простой код алгоритма:

#include <iostream> 
#include <string> 

std::string::size_type GetLengthWithUTF(std::string &sValue); 

int main() 
{ 
    std::string sTestValueUTF8 = "\xD0\xB6\xD0\xB6\xD0\xB6"; 
    std::string sTestValueASCII = "\x67\x67\x67"; 
    std::string sTestValueMIX = "\x67\x67\x67\xD0\xB6\xD0\xB6\xD0\xB6"; 
    std::string::size_type iFuncResult = 0; 

    std::cout << "=========== START TEST ==========\n\n"; 

    std::cout << "+TEST UTF8 STRING\n"; 
    std::cout << "+----+Bytes of string (sTestValueUTF8.length()) = " << sTestValueUTF8.length() << "\n"; 
    iFuncResult = GetLengthWithUTF(sTestValueUTF8); 
    std::cout << "+----+Function result (GetLengthWithUTF(\"" << sTestValueUTF8 << "\")) = " << iFuncResult<< "\n\n"; 

    std::cout << "+TEST ASCII STRING\n"; 
    std::cout << "+----+Bytes of string (sTestValueASCII.length()) = " << sTestValueASCII.length() << "\n"; 
    iFuncResult = GetLengthWithUTF(sTestValueASCII); 
    std::cout << "+----+Function result (GetLengthWithUTF(\"" << sTestValueASCII << "\")) = " << iFuncResult<< "\n\n"; 

    std::cout << "+TEST MIX STRING\n"; 
    std::cout << "+----+Bytes of string (sTestValueMIX.length()) = " << sTestValueMIX.length() << "\n"; 
    iFuncResult = GetLengthWithUTF(sTestValueMIX); 
    std::cout << "+----+Function result (GetLengthWithUTF(\"" << sTestValueMIX << "\")) = " << iFuncResult<< "\n\n"; 

    std::cout << "\n=========== END TEST ==========\n\n"; 
} 

std::string::size_type GetLengthWithUTF(std::string &sValue) 
{ 
    std::cout << "  +----+START GetLengthWithUTF\n"; 
    std::cout << "   +Input string is: " << sValue << "\n"; 
    std::string::size_type i; 
    std::cout << "   +Start cycle\n"; 
    int iCountUTF8characters = 0; 
    for (i = 0; i < sValue.length(); i++) 
    { 
     std::cout << "   +----+Iteration N " << i << "\n"; 
     std::cout << "    +Current character is: " << sValue[i] << ", integer value = " << (int)sValue[i] << "\n"; 
     if (sValue[i] > 127) 
     { 
      iCountUTF8characters++; 
      std::cout << "    +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: " << iCountUTF8characters << "\n"; 
     } 
     else 
     { 
      std::cout << "    +----+If statement (sValue[i] > 127) is false.\n"; 
     } 
    } 

    std::cout << "   +End cycle\n"; 
    iCountUTF8characters = iCountUTF8characters/2; 
    std::cout << "   +Return sValue.length() - (iCountUTF8characters/2) ---> " << sValue.length() << " - (" << iCountUTF8characters << "/2) = " << (sValue.length() - (std::string::size_type)iCountUTF8characters) <<"\n"; 
    std::cout << "  +----+ASCIID GetLengthWithUTF\n"; 
    return (sValue.length() - (std::string::size_type)iCountUTF8characters); 
} 

Команды компиляции в консоли:

AIX 6

g++ -o test test.cpp 

RHEL Server 6.7 (Santiago)

g++ -o test test.cpp 

Microsoft Windows v10.0.14393

cl /EHsc test.cpp 



Результаты:

AIX 6

=========== START TEST ========== 

+TEST UTF8 STRING 
+----+Bytes of string (sTestValueUTF8.length()) = 6 
    +----+START GetLengthWithUTF 
      +Input string is: жжж 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: Ь integer value = 208 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 1 
      +----+Iteration N 1 
       +Current character is: ֬ integer value = 182 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 2 
      +----+Iteration N 2 
       +Current character is: Ь integer value = 208 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 3 
      +----+Iteration N 3 
       +Current character is: ֬ integer value = 182 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 4 
      +----+Iteration N 4 
       +Current character is: Ь integer value = 208 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 5 
      +----+Iteration N 5 
       +Current character is: ֬ integer value = 182 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 6 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 6 - (3/2) = 3 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("жжж")) = 3 

+TEST ASCII STRING 
+----+Bytes of string (sTestValueASCII.length()) = 3 
    +----+START GetLengthWithUTF 
      +Input string is: ggg 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 3 - (0/2) = 3 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("ggg")) = 3 

+TEST MIX STRING 
+----+Bytes of string (sTestValueMIX.length()) = 9 
    +----+START GetLengthWithUTF 
      +Input string is: gggжжж 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 3 
       +Current character is: Ь integer value = 208 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 1 
      +----+Iteration N 4 
       +Current character is: ֬ integer value = 182 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 2 
      +----+Iteration N 5 
       +Current character is: Ь integer value = 208 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 3 
      +----+Iteration N 6 
       +Current character is: ֬ integer value = 182 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 4 
      +----+Iteration N 7 
       +Current character is: Ь integer value = 208 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 5 
      +----+Iteration N 8 
       +Current character is: ֬ integer value = 182 
       +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 6 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 9 - (3/2) = 6 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("gggжжж")) = 6 


=========== END TEST ========== 

RHEL Server 6.7 (Santiago)

=========== START TEST ========== 

+TEST UTF8 STRING 
+----+Bytes of string (sTestValueUTF8.length()) = 6 
    +----+START GetLengthWithUTF 
      +Input string is: жжж 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 3 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 4 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 5 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 6 - (0/2) = 6 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("жжж")) = 6 

+TEST ASCII STRING 
+----+Bytes of string (sTestValueASCII.length()) = 3 
    +----+START GetLengthWithUTF 
      +Input string is: ggg 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 3 - (0/2) = 3 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("ggg")) = 3 

+TEST MIX STRING 
+----+Bytes of string (sTestValueMIX.length()) = 9 
    +----+START GetLengthWithUTF 
      +Input string is: gggжжж 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 3 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 4 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 5 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 6 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 7 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 8 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 9 - (0/2) = 9 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("gggжжж")) = 9 


=========== END TEST ========== 

Microsoft Windows v10.0.14393

=========== START TEST ========== 

+TEST UTF8 STRING 
+----+Bytes of string (sTestValueUTF8.length()) = 6 
    +----+START GetLengthWithUTF 
      +Input string is: жжж 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 3 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 4 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 5 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 6 - (0/2) = 6 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("жжж")) = 6 

+TEST ASCII STRING 
+----+Bytes of string (sTestValueASCII.length()) = 3 
    +----+START GetLengthWithUTF 
      +Input string is: ggg 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 3 - (0/2) = 3 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("ggg")) = 3 

+TEST MIX STRING 
+----+Bytes of string (sTestValueMIX.length()) = 9 
    +----+START GetLengthWithUTF 
      +Input string is: gggжжж 
      +Start cycle 
      +----+Iteration N 0 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 1 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 2 
       +Current character is: g, integer value = 103 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 3 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 4 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 5 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 6 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 7 
       +Current character is: Ь integer value = -48 
       +----+If statement (sValue[i] > 127) is false. 
      +----+Iteration N 8 
       +Current character is: ֬ integer value = -74 
       +----+If statement (sValue[i] > 127) is false. 
      +End cycle 
      +Return sValue.length() - (iCountUTF8characters/2) ---> 9 - (0/2) = 9 
    +----+ASCIID GetLengthWithUTF 
+----+Function result (GetLengthWithUTF("gggжжж")) = 9 


=========== END TEST ========== 

Алгоритм должен рассчитать количество символов в строке. Как видно из результатов тестов, он работает корректно только под AIX.

Буду рад, если кто-нибудь поможет мне понять это абсурдное, на мой взгляд, поведение алгоритма в разных операционных системах. Алгоритм был написан под ОС AIX. После перехода с AIX на Linux обнаружилось, что он работает неправильно, и я провёл более обширные тесты, результаты которых приведены выше. Мой главный вопрос: каким образом этот чёртов алгоритм вообще работает в AIX? Я не могу объяснить это логически.

+2

Этот алгоритм неправильный: он будет работать только с небольшим подмножеством Юникода. Лучший алгоритм — подсчитать количество байтов, для которых выполняется условие `(ch & 0xC0) != 0x80`; оно исключает только неначальные байты последовательностей (в диапазоне 0x80–0xBF). – rici

+0

Да, вы правы. Этот алгоритм устаревший, очень старый и проверяет строки короче 200 символов. Впрочем, он уже заменён алгоритмом, который вы описали выше. Мне просто было интересно разобраться в причине проблемы. – stoyanov

ответ

4

Похоже, что эти системы отличаются тем, как в них определена знаковость типа char, — стандарт это допускает. Ваш компилятор в AIX считает char беззнаковым (unsigned), в то время как две другие системы считают его знаковым (signed).

В системах с беззнаковым char условие sValue[i] > 127 ведёт себя именно так, как можно было бы ожидать. Однако в системах со знаковым char то же самое выражение никогда не бывает истинным.

Вот почему вы получаете отрицательные числа для символов с кодами 128 и выше. Например, 208 становится -48, когда он рассматривается как однобайтовое значение со знаком.

Это можно исправить, принудительно приведя значение к беззнаковому типу либо проверив восьмой (старший) бит с помощью битовой маски:

if (sValue[i] & 128) { 
    ... // MSB is set 
} 
+0

Черт побери! Это верно! Большое спасибо!!! if (unsigned (sValue [i])> 127) - медленная версия, но более читаемая для некоторых разработчиков;) – stoyanov