Author Topic: Validate Japanese Half-Width(Hankaku) Katakana Character (Read 11971 times)

Leo Chu · « **on:** September 08, 2016, 02:51:51 PM »

Liquid UI: Validate Japanese Half-Width(Hankaku) Katakana Character

This example shows how to create logic that checks whether the user input contains any Half-width (Hankaku) Katakana characters.
In Unicode, the Half-width Katakana characters fall within a specific range (U+FF61 to U+FF9F).
The logic converts each character to its Unicode code point and then determines whether any of them falls within the Half-width Katakana range.

Step 1: Create user interface
//User interface
//Screen script: strips the listed toolbar pushbuttons, then builds a small
//test screen with one input field and one toolbar button that starts the validation.

//Delete all existing pushbuttons on toolbar
//Each "P[...]" argument names one pushbutton by its label.
del("P[User menu]");
del("P[SAP menu]");
del("P[SAP Business Workplace]");
del("P[Other menu]");
del("P[Add to Favorites]");
del("P[Delete Favorites]");
del("P[Change Favorites]");
del("P[Move Favorites down]");
del("P[Move Favorites up]");
del("P[Create role]");
del("P[Assign users]");
del("P[Documentation]");

//NOTE(review): presumably removes the remaining standard screen elements so only
//the controls created below are shown - confirm against the Liquid UI reference.
clearscreen();

//Input field labelled "Input Data"; its value is exposed under the name z_input, size 40.
//NOTE(review): [2,2] and [2,20] look like [row,column] positions of label and field - confirm.
inputfield([2,2], "Input Data", [2,20], {"name":"z_input", "size":40});

//Toolbar pushbutton "Validate Input" that runs validateHalfWidthKatakana when pressed.
//NOTE(review): the "?" argument is presumably the function code sent with the button - confirm.
pushbutton([TOOLBAR], "Validate Input", "?", {"process":validateHalfWidthKatakana});

Step 2: Create function to validate the converted unicode result
//Function to validate the converted unicode result
function validateHalfWidthKatakana(){

      onscreen "*"
            var splited_ary = getSplitedTextAry(z_input);            //Logic to split the input data into an array by actual characters

            var regular_char_counter = 0;
            var half_width_char_counter = 0;
            var full_width_char_counter = 0;
            var other_char_counter = 0;
            var cur_unicode = "";

            //Logic to calculate how many Half-Width Katakana/regular character/number/syntax in the input data
            for(var k=0; k<splited_ary.length; k++){

                  if(splited_ary[k].length > 1){
                        cur_unicode = getUnicode(splited_ary[k]);      //Logic to get Unicode for each character

                        if(      (cur_unicode >= "3000" && cur_unicode <= "30ff") ||             //Punctuation, Hiragana, Katakana
                                 (cur_unicode >= "FF00" && cur_unicode <= "FF9F") ||             //Full-width Roman, Half-width Katakana
                                 (cur_unicode >= "4E00" && cur_unicode <= "9FAF") ||             //CJK (Common & Uncommon)
                                 (cur_unicode >= "3400" && cur_unicode <= "4DBF")){                  //CJK Ext. A (Rare)
                              if(cur_unicode >= "FF61" && cur_unicode <= "FF9F"){            //It's a Half width katakana (Hankaku)
                                    half_width_char_counter++;
                              } else{
                                    //It's either Punctuation, Hiragana, Katakana, Full-width Roman, CJK, or CJK Ext. A
                                    full_width_char_counter++;
                              }
                        } else {
                              other_char_counter++;      //Unhandled characters
                        }
                  } else {
                        regular_char_counter++;            //It's regular character/number/syntax
                  }
            }

            println("=====>> Half-Width count: "+half_width_char_counter);
            println("=====>> Full-Width count: "+full_width_char_counter);

            message("S:Input Data contain " + half_width_char_counter + " Half-Width Katakana");

            enter("?");
}

Step 3: Create function to split the string based on actual characters
//Function to return an array with splitted text
//
//Splits str into "actual" characters. The string is expected to hold each
//3-byte UTF-8 character as 3 consecutive string characters, each of which
//encodeURI() turns into 6 characters ("%XX%XX"), i.e. 18 encoded characters
//per actual character.
//
//  str     - the text to split
//  returns - array of strings: one entry of length 1 per regular character and
//            one entry of length 3 per 3-byte character ([] for an empty string)
//
//Only 3-byte characters are supported; other multi-byte lengths are still
//consumed in groups of 3 and will therefore be split incorrectly.
function getSplitedTextAry(str){
      var result_ary = [];
      var ref_str = str;                              //Unencoded text, consumed in step with converted_str
      var converted_str = encodeURI(str);            //Converted the string becomes encoded result

      var pct_pos = -1;
      var plain_str = "";

      //Loop until the converted string becomes nothing
      while(converted_str.length > 0){
            pct_pos = converted_str.indexOf("%");

            if(pct_pos !== 0){
                  //Leading run without any escape: encoded and unencoded text match
                  //one-to-one, so every character is a regular character.
                  //Without any "%" left, the whole remainder is such a run.
                  plain_str = (pct_pos < 0) ? ref_str : ref_str.slice(0, pct_pos);

                  for(var j=0; j<plain_str.length; j++){            //Push individual characters into the array
                        result_ary.push(plain_str.charAt(j));
                  }

                  converted_str = (pct_pos < 0) ? "" : converted_str.slice(pct_pos);
                  ref_str = (pct_pos < 0) ? "" : ref_str.slice(pct_pos);
            }
            else if(parseInt(converted_str.slice(1,3), 16) < 0x80){
                  //encodeURI() also escapes some ASCII characters (space, %, ", <, > ...)
                  //as a single "%XX" with XX below 80. That is one regular character,
                  //not the start of a multi-byte character.
                  result_ary.push(ref_str.charAt(0));

                  converted_str = converted_str.slice(3);            //Subtract the string
                  ref_str = ref_str.slice(1);                        //Subtract the string
            }
            else {
                  //Multi-byte character starts here
                  result_ary.push(ref_str.slice(0,3));            //Every actual character are 3 char long before encoded

                  converted_str = converted_str.slice(18);      //Every actual character are 18 char long after encoded
                  ref_str = ref_str.slice(3);                        //Subtract the string
            }
      }

      return result_ary;
}

Note: This logic is required to recognize each individual actual character in the string.
            Here we only consider the 3-byte characters for UTF-8 encoding.



Step 4: Create function to convert a multi-byte character to its Unicode value
//Function to return Unicode in string
//
//Converts one multi-byte character, given as a string with one string
//character per UTF-8 byte, to its Unicode value.
//
//  str     - the characters that make up one actual character
//  returns - the Unicode value as upper-case hex (e.g. "FF71") when str is
//            3 characters long; "" for every other length, because only the
//            3-byte UTF-8 form is decoded
//
//Each byte and the final result are also written out with println().
function getUnicode(str){
      var result_ary = [];
      var ucode = "";            //Stays "" unless a supported sequence length is decoded below
      var cur_byte = 0;
      var cur_bits = "";

      //Logic to get the actual byte value for each unencoded character
      for(var k=0; k<str.length; k++){
            //Keep only the low 8 bits. This yields the same byte as taking the last
            //8 binary digits of a 32-digit char code, and also works when the char
            //code is already a plain 0x00-0xFF value.
            cur_byte = str.charCodeAt(k) & 0xFF;

            cur_bits = cur_byte.toString(2);
            while(cur_bits.length < 8){            //Left-pad so the log always shows 8 bits
                  cur_bits = "0" + cur_bits;
            }

            println("===>>"+cur_bits+"<==");
            result_ary.push(cur_byte);
      }

      //Logic to form multi-byte data become 16-bit hex value
      switch(result_ary.length){
            case 3:            //U+00000800 - U+0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
                  var c1 = result_ary[0];
                  var c2 = result_ary[1];
                  var c3 = result_ary[2];

                  var b1 = (c1 << 4) | ((c2 >> 2) & 0x0F);            //Low 4 bits of byte 1 + upper 4 payload bits of byte 2
                  var b2 = ((c2 & 0x03) << 6) | (c3 & 0x3F);            //Lower 2 payload bits of byte 2 + 6 payload bits of byte 3
                  ucode = ((b1 & 0x00FF) << 8) | b2;                  //Mask drops the 1110 marker bits of byte 1
                  break;

            default:
                  //Not decoded, result stays "":
                  // 1 byte : U+00000000 - U+0000007F 0xxxxxxx
                  // 2 bytes: U+00000080 - U+000007FF 110xxxxx 10xxxxxx
                  // 4 bytes: U+00010000 - U+001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                  // 5 bytes: U+00200000 - U+03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                  // 6 bytes: U+04000000 - U+7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                  // Any other length (including an empty string) previously left the
                  // result undefined and made the toString() call below throw.
                  break;
      }

      var ucode_hex = ucode.toString(16).toUpperCase();            //Unicode value in hex ("" stays "")

      println("===>>Unicode="+ucode_hex+"<==");

      return ucode_hex;            //Return Unicode value in hex
}

Note: Unicode conversion should include 6 different cases.
            Here we only consider the 3-byte characters for UTF-8 encoding.

See attachments for code samples

News:

Author Topic: Validate Japanese Half-Width(Hankaku) Katakana Character (Read 11971 times)

Leo Chu

Validate Japanese Half-Width(Hankaku) Katakana Character