Author Topic: Validate Japanese Half-Width(Hankaku) Katakana Character  (Read 11971 times)

Leo Chu

  • GuiXT Forum
  • Newbie
  • *
  • Posts: 16
    • View Profile
Validate Japanese Half-Width(Hankaku) Katakana Character
« on: September 08, 2016, 02:51:51 PM »
Liquid UI: Validate Japanese Half-Width(Hankaku) Katakana Character

This example shows how to build logic that checks whether the user input contains any Half-width (Hankaku) Katakana characters.
In Unicode, the Half-width Katakana characters fall within a fixed code point range.
The logic converts each character to its Unicode code point, then checks whether any result falls within the Half-width Katakana range.



Step 1: Create user interface
//User interface

//Standard toolbar pushbuttons to remove, listed in the order they are deleted
var toolbar_buttons = [
      "P[User menu]",
      "P[SAP menu]",
      "P[SAP Business Workplace]",
      "P[Other menu]",
      "P[Add to Favorites]",
      "P[Delete Favorites]",
      "P[Change Favorites]",
      "P[Move Favorites down]",
      "P[Move Favorites up]",
      "P[Create role]",
      "P[Assign users]",
      "P[Documentation]"
];

//Delete all existing pushbuttons on toolbar
for(var btn_idx=0; btn_idx<toolbar_buttons.length; btn_idx++){
      del(toolbar_buttons[btn_idx]);
}

//Wipe the remaining screen content before adding the custom controls
clearscreen();

//Input field whose value is read later through the name z_input
inputfield([2,2], "Input Data", [2,20], {"name":"z_input", "size":40});

//Toolbar pushbutton that runs validateHalfWidthKatakana when pressed
pushbutton([TOOLBAR], "Validate Input", "?", {"process":validateHalfWidthKatakana});



Step 2: Create function to validate the converted unicode result
//Function to validate the converted unicode result
function validateHalfWidthKatakana(){
      
      onscreen "*"
            var splited_ary = getSplitedTextAry(z_input);            //Logic to split the input data into an array by actual characters
            
            var regular_char_counter = 0;
            var half_width_char_counter = 0;
            var full_width_char_counter = 0;
            var other_char_counter = 0;
            var cur_unicode = "";
            
            //Logic to calculate how many Half-Width Katakana/regular character/number/syntax in the input data
            for(var k=0; k<splited_ary.length; k++){
                  
                  if(splited_ary[k].length > 1){
                        cur_unicode = getUnicode(splited_ary[k]);      //Logic to get Unicode for each character
                        
                        if(      (cur_unicode >= "3000" && cur_unicode <= "30ff") ||             //Punctuation, Hiragana, Katakana
                                 (cur_unicode >= "FF00" && cur_unicode <= "FF9F") ||             //Full-width Roman, Half-width Katakana
                                 (cur_unicode >= "4E00" && cur_unicode <= "9FAF") ||             //CJK (Common & Uncommon)
                                 (cur_unicode >= "3400" && cur_unicode <= "4DBF")){                  //CJK Ext. A (Rare)
                              if(cur_unicode >= "FF61" && cur_unicode <= "FF9F"){            //It's a Half width katakana (Hankaku)
                                    half_width_char_counter++;
                              } else{
                                    //It's either Punctuation, Hiragana, Katakana, Full-width Roman, CJK, or CJK Ext. A
                                    full_width_char_counter++;
                              }
                        } else {
                              other_char_counter++;      //Unhandled characters
                        }
                  } else {
                        regular_char_counter++;            //It's regular character/number/syntax
                  }
            }
            
            println("=====>> Half-Width count: "+half_width_char_counter);
            println("=====>> Full-Width count: "+full_width_char_counter);
            
            message("S:Input Data contain " + half_width_char_counter + " Half-Width Katakana");
            
            enter("?");
}



Step 3: Create function to split the string based on actual characters
//Function to return an array with splitted text
//Splits str into one array element per actual character.
//Assumes every character of str holds one UTF-8 byte, so that encodeURI turns each
//byte >= 0x80 into a 6-character escape ("%C2%xx" or "%C3%xx"); this is the same
//assumption behind the "3 chars before / 18 chars after encoding" rule this logic
//started from.
//  str     : the text to split
//  returns : array of strings; single-byte characters are 1 long, multi-byte
//            characters are 2 to 4 long (one element per UTF-8 sequence)
function getSplitedTextAry(str){
      var result_ary = [];
      var converted_str = encodeURI(str);            //Encoded copy, used only to tell byte kinds apart
      var ref_pos = 0;                               //Read position in str
      var conv_pos = 0;                              //Read position in converted_str
      
      while(conv_pos < converted_str.length){
            //Unescaped character: maps 1:1 to a single-byte character of str
            if(converted_str.charAt(conv_pos) != "%"){
                  result_ary.push(str.charAt(ref_pos));
                  ref_pos += 1;
                  conv_pos += 1;
                  continue;
            }
            
            var first_escape = parseInt(converted_str.slice(conv_pos+1, conv_pos+3), 16);
            
            //Escaped ASCII (space, %, ", <, > ...): one "%XX" escape for one single-byte
            //character. Treating it as a multi-byte character would shift every later split.
            if(first_escape < 0x80){
                  result_ary.push(str.charAt(ref_pos));
                  ref_pos += 1;
                  conv_pos += 3;
                  continue;
            }
            
            //High byte: rebuild its value from the two-escape form to read the
            //UTF-8 sequence length from the lead byte
            var second_escape = parseInt(converted_str.slice(conv_pos+4, conv_pos+6), 16);
            var lead_byte = ((first_escape & 0x1F) << 6) | (second_escape & 0x3F);
            
            var seq_len = 1;                                    //Stray continuation/invalid byte: keep it alone
            if(lead_byte >= 0xF0 && lead_byte <= 0xF7){
                  seq_len = 4;                                  //11110xxx
            } else if(lead_byte >= 0xE0 && lead_byte <= 0xEF){
                  seq_len = 3;                                  //1110xxxx
            } else if(lead_byte >= 0xC0 && lead_byte <= 0xDF){
                  seq_len = 2;                                  //110xxxxx
            }
            
            result_ary.push(str.slice(ref_pos, ref_pos+seq_len));      //Push entire multi-byte character
            ref_pos += seq_len;
            conv_pos += 6*seq_len;                                     //Every high byte is 6 chars long after encoding
      }
      
      return result_ary;
}

Note: This logic is required to recognize each individual actual character in the string.
            Here we only consider the 3-byte characters for UTF-8 encoding.


            

Step 4: Create function to convert a multi-byte character to its Unicode value
//Function to return Unicode in string
//Decodes one multi-byte character, given as one UTF-8 byte per character of str,
//into its Unicode code point.
//  str     : the bytes of a single character
//  returns : the code point as upper-case hex, zero-padded to at least 4 digits
//            (e.g. "FF71", "0915"), or "" when str is not a 3-byte sequence
//            (only the 3-byte UTF-8 form is decoded here)
function getUnicode(str){
      var byte_ary = [];
      
      //Logic to get the actual byte value for each unencoded character.
      //Masking with 0xFF keeps the low 8 bits whether charCodeAt reports the byte as
      //0-255 or sign-extended, instead of relying on a 32-character binary string.
      for(var k=0; k<str.length; k++){
            var cur_byte = str.charCodeAt(k) & 0xFF;
            println("===>>"+cur_byte.toString(2)+"<==");
            byte_ary.push(cur_byte);
      }
      
      //Any length other than 3 (including 0 and more than 6) yields an empty result
      //instead of calling toString on an undefined value
      if(byte_ary.length != 3){
            println("===>>Unicode=<==");
            return "";
      }
      
      //U+0800 - U+FFFF   1110xxxx 10xxxxxx 10xxxxxx
      var ucode = ((byte_ary[0] & 0x0F) << 12) | ((byte_ary[1] & 0x3F) << 6) | (byte_ary[2] & 0x3F);
      
      //Pad to 4 hex digits so U+0800 - U+0FFF do not come back as 3-digit strings,
      //which would break callers comparing against 4-digit range bounds
      var hex = ucode.toString(16).toUpperCase();
      while(hex.length < 4){
            hex = "0" + hex;
      }
      
      println("===>>Unicode="+hex+"<==");
      
      return hex;            //Return Unicode value in hex
}

Note: A complete Unicode conversion should handle all 6 UTF-8 sequence lengths.
            Here we only consider the 3-byte characters for UTF-8 encoding.




See attachments for code samples
« Last Edit: June 20, 2017, 01:48:07 PM by Leo Chu »