Real'sHowTo AddThis Feed Button
Custom Search

Detect non-ASCII character in a String
Tag(s): Internationalization String/Number


import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharacterCodingException;

public class StringUtils {

  /**
   * Tells whether a string consists solely of US-ASCII characters.
   *
   * The string is first encoded to UTF-8 with a strict encoder and the
   * resulting bytes are then decoded as US-ASCII. An explicit charset is used
   * instead of the platform default so that the answer does not depend on the
   * environment, and the strict encoder rejects unpaired surrogates instead
   * of silently substituting an ASCII replacement byte for them.
   *
   * @param v the string to inspect; must not be null
   * @return true if every character of v is a US-ASCII character
   *         (the empty string qualifies), false otherwise
   * @throws NullPointerException if v is null
   */
  public static boolean isPureAscii(String v) {
    try {
      // A fresh encoder reports malformed input (e.g. a lone surrogate)
      // by throwing rather than replacing it.
      ByteBuffer utf8 = Charset.forName("UTF-8").newEncoder().encode(CharBuffer.wrap(v));
      // In UTF-8 every non-ASCII character yields bytes >= 0x80, which the
      // US-ASCII decoder rejects.
      CharsetDecoder asciiDecoder = Charset.forName("US-ASCII").newDecoder();
      asciiDecoder.decode(utf8);
    }
    catch (CharacterCodingException e) {
      // Either the input was not well-formed UTF-16 or it held a non-ASCII character.
      return false;
    }
    return true;
  }

  /**
   * Small demonstration of isPureAscii().
   *
   * @param args ignored
   */
  public static void main (String args[])
    throws Exception {

     String test = "Réal";
     System.out.println(test + " isPureAscii() : " + StringUtils.isPureAscii(test));
     test = "Real";
     System.out.println(test + " isPureAscii() : " + StringUtils.isPureAscii(test));
     
     /*
      * output :
      *   Réal isPureAscii() : false
      *   Real isPureAscii() : true
      */
  }
}
A different (and simpler) approach is to take a given string and check if it's possible to encode it into ASCII.
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;

public class StringUtils {

  // Shared encoder used to test whether text is representable in US-ASCII.
  // CharsetEncoder instances are stateful and not safe for concurrent use,
  // so every access must hold the encoder's monitor (see isPureAscii).
  static CharsetEncoder asciiEncoder = 
      Charset.forName("US-ASCII").newEncoder(); // or "ISO-8859-1" for ISO Latin 1
  
  /**
   * Tells whether a string can be encoded entirely in US-ASCII.
   *
   * Access to the shared encoder is serialized: canEncode() may change the
   * encoder's state, so unsynchronized calls from several threads could
   * interfere with each other.
   *
   * @param v the string to inspect; must not be null
   * @return true if the shared encoder can encode every character of v
   *         (the empty string qualifies), false otherwise
   * @throws NullPointerException if v is null
   */
  public static boolean isPureAscii(String v) {
    // Read the field once so the lock and the call use the same instance.
    final CharsetEncoder encoder = asciiEncoder;
    synchronized (encoder) {
      return encoder.canEncode(v);
    }
  }

  /**
   * Small demonstration of isPureAscii().
   *
   * @param args ignored
   */
  public static void main (String args[])
    throws Exception {

     String test = "Réal";
     System.out.println(test + " isPureAscii() : " + StringUtils.isPureAscii(test));
     test = "Real";
     System.out.println(test + " isPureAscii() : " + StringUtils.isPureAscii(test));
     
     /*
      * output :
      *   Réal isPureAscii() : false
      *   Real isPureAscii() : true
      */
  }
}

Another way is to use a regular expression, see this Javascript HowTo for a hint!
To simply strip any non-ASCII characters from a string:
public class Test {

    /**
     * Returns a copy of the given text with every character outside the
     * ASCII range removed.
     */
    private static String stripNonAscii(String text) {
      return text.replaceAll("[^\\p{ASCII}]", "");
    }

    /**
     * Demonstrates stripping the non-ASCII characters from a sample string
     * and prints what is left.
     */
    public static void main(String args[]){
      System.out.println(stripNonAscii("eéaà"));
      /*
       * output : ea
       */
    }
}

See also Unaccent letters.
blog comments powered by Disqus


If you find this article useful, consider making a small donation
to show your support for this Web site and its content.

Written and compiled by Réal Gagnon ©1998-2013
[ home ]