How to map SQL collation setting to a Java comparator? -
is there way translate database's collation setting (e.g. sql_latin1_general_cp1_ci_as
) java comparator
implementation can apply same ordering database does, using in java code?
is there existing library provides mapping?
i ended doing following:
- query current database's collation setting.
- next, parse description of collator sub-components such "case-insensitive" or "accent-sensitive".
- next, construct
comparator
corresponding these rules
enjoy!
/** * returns comparator associated database's default collation. * <p> * beware! <a href="http://stackoverflow.com/a/361059/14731">some databases</a> sort unicode strings differently * non-unicode strings, same collation setting. * <p> * @param unicode true if string being sorted unicode, false otherwise * @return comparator associated database's default collation * @throws databaseexception if unexpected database error occurs */ public comparator<string> getcomparator(boolean unicode) throws databaseexception { // @see http://stackoverflow.com/a/5072926/14731, http://stackoverflow.com/a/27052010/14731 , // http://stackoverflow.com/q/32209137/14731 try (connection connection = server.getdatasource().getconnection()) { try (preparedstatement statement = connection.preparestatement( "select description sys.fn_helpcollations()\n" + "where name = serverproperty('collation')")) { try (resultset rs = statement.executequery()) { if (!rs.next()) throw new objectnotfoundexception(this); string description = rs.getstring(1); list<string> tokens = arrays.aslist(description.split(",\\s*")); // description format: language,property1,property2,...,propertyn,sorting,... comparatorbuilder comparatorbuilder = new comparatorbuilder(); // skip language tokens = tokens.sublist(1, tokens.size()); // see https://technet.microsoft.com/en-us/library/ms143515(v=sql.90).aspx list of possible tokens (string token: tokens) { if (token.tolowercase().contains("sort")) { // stop hit information related sorting order break; } switch (token) { case "case-insensitive": { comparatorbuilder.caseinsensitive(true); break; } case "accent-insensitive": { comparatorbuilder.accentinsensitive(true); break; } case "kanatype-insensitive": { comparatorbuilder.kanainsensitive(true); break; } case "width-insensitive": case "width-insensitive unicode data": { comparatorbuilder.widthinsensitive(true); break; } case "case-sensitive": case "accent-sensitive": case "kanatype-sensitive": case "width-sensitive": { // nothing, default setting. break; } default: throw new assertionerror(string.format("unexpected token: '%s'. description: '%s'", token, description)); } } assert (!rs.next()): "database returned more rows expected"; if (unicode) comparatorbuilder.discardhyphens(true); return comparatorbuilder.build(); } } } catch (sqlexception e) { throw new databaseexception(e); } } import com.ibm.icu.text.transliterator; import java.text.normalizer; import java.util.comparator; /** * converts database collation java comparator. * <p> * @see https://msdn.microsoft.com/en-us/library/hh230914.aspx?f=255&mspperror=-2147217396 * @see http://zarez.net/?p=1893 * @author gili tzabari */ class comparatorbuilder { // sql server: https://technet.microsoft.com/en-us/library/ms143515(v=sql.90).aspx private boolean caseinsensitive = false; private boolean accentinsensitive = false; private boolean kanainsensitive = false; private boolean widthinsensitive = false; /** * indicates if hyphens should discarded prior sorting (default = false). */ private boolean discardhyphens = false; /** * @return true if comparator ignores difference between uppercase , lowercase letters (default = false) */ public boolean caseinsensitive() { return caseinsensitive; } /** * @param value true if comparator ignores difference between uppercase , lowercase letters * @return */ public comparatorbuilder caseinsensitive(boolean value) { this.caseinsensitive = value; return this; } /** * @return true if comparator ignores difference between accented , unaccented characters (default = false) */ public boolean accentinsensitive() { return accentinsensitive; } /** * @param value true if comparator ignores difference between accented , unaccented characters * @return */ public comparatorbuilder accentinsensitive(boolean value) { this.accentinsensitive = value; return this; } /** * @return true if comparator ignores difference between 2 types of japanese kana characters: hiragana * , katakana (default = false) */ public boolean kanainsensitive() { return kanainsensitive; } /** * @param value true if comparator ignores difference between 2 types of japanese kana characters: * hiragana , katakana * @return */ public comparatorbuilder kanainsensitive(boolean value) { this.kanainsensitive = value; return this; } /** * @return true if comparator ignores difference between single-byte character , same character when * represented double-byte character (default = false) */ public boolean widthinsensitive() { return widthinsensitive; } /** * @param value true if comparator ignores difference between single-byte character , same character * when represented double-byte character * @return */ public comparatorbuilder widthinsensitive(boolean value) { this.widthinsensitive = value; return this; } /** * @return true if comparator discards hyphens prior sorting (default = false) */ public boolean discardhyphens() { return discardhyphens; } /** * @param value true if comparator discards hyphens prior sorting * @return */ public comparatorbuilder discardhyphens(boolean value) { this.discardhyphens = value; return this; } /** * @return comparator instance */ public comparator<string> build() { return (java.lang.string first, java.lang.string second) -> { string firstnormalized = first; string secondnormalized = second; if (discardhyphens) { firstnormalized = firstnormalized.replaceall("-", ""); secondnormalized = secondnormalized.replaceall("-", ""); } if (accentinsensitive) { // @see http://stackoverflow.com/a/3322174/14731 firstnormalized = normalizer.normalize(first, normalizer.form.nfd).replaceall("[^\\p{ascii}]", ""); secondnormalized = normalizer.normalize(second, normalizer.form.nfd).replaceall("[^\\p{ascii}]", ""); } if (kanainsensitive) { // @see http://stackoverflow.com/a/6577778/14731 transliterator transliterator = transliterator.getinstance("hiragana-katakana"); firstnormalized = transliterator.transliterate(firstnormalized); secondnormalized = transliterator.transliterate(secondnormalized); } if (widthinsensitive) { transliterator transliterator = transliterator.getinstance("halfwidth-fullwidth"); firstnormalized = transliterator.transliterate(firstnormalized); secondnormalized = transliterator.transliterate(secondnormalized); } // case-normalization not easy seems. see // http://mattryall.net/blog/2009/02/the-infamous-turkish-locale-bug , implementation of // string.comparetoignorecase(). better delegate trusted implementation. if (caseinsensitive) return firstnormalized.comparetoignorecase(secondnormalized); else return firstnormalized.compareto(secondnormalized); }; } }
Comments
Post a Comment