日文字符的Unicode编码范围是:
U+3040–U+309F: Hiragana
U+30A0–U+30FF: Katakana
U+4E00–U+9FFF: Kanji (CJK Unified Ideographs)
所以我们只需要判断每一个字符是否位于这三个区间即可。另外，如果你希望检测某个字符串是否含有简体中文字符、繁体中文字符等，只需要查询对应语言的Unicode编码范围，对下面的代码稍作改动即可。
Java code:
package com.hankcs;
import java.util.HashSet;
import java.util.Set;
/**
 * Demo that classifies every character of a sample string as Japanese or not,
 * based on the Unicode block the character belongs to.
 */
public class Main
{
    /**
     * Prints one line per code point of a mixed English/Japanese sample sentence,
     * stating whether that code point lies in the Hiragana, Katakana or
     * CJK Unified Ideographs block.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args)
    {
        // Blocks treated as "Japanese" by this demo. Swap in other blocks to
        // detect a different script.
        Set<Character.UnicodeBlock> japaneseUnicodeBlocks = new HashSet<Character.UnicodeBlock>();
        japaneseUnicodeBlocks.add(Character.UnicodeBlock.HIRAGANA);
        japaneseUnicodeBlocks.add(Character.UnicodeBlock.KATAKANA);
        japaneseUnicodeBlocks.add(Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS);

        String mixed = "This is a Japanese newspaper headline: ラドクリフ、マラソン五輪代表に1万m出場にも含み";

        // Walk code points rather than chars so that a supplementary character
        // (encoded as a surrogate pair) is reported once, as a whole, instead of
        // as two meaningless surrogate halves.
        for (int index = 0; index < mixed.length(); )
        {
            int codePoint = mixed.codePointAt(index);
            index += Character.charCount(codePoint);

            String symbol = new String(Character.toChars(codePoint));
            if (japaneseUnicodeBlocks.contains(Character.UnicodeBlock.of(codePoint)))
            {
                System.out.println(symbol + " is a Japanese character");
            }
            else
            {
                System.out.println(symbol + " is not a Japanese character");
            }
        }
    }
}