/** * Copyright 2014 JogAmp Community. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are * permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this list of * conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, this list * of conditions and the following disclaimer in the documentation and/or other materials * provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY JogAmp Community ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JogAmp Community OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * The views and conclusions contained in the software and documentation are those of the * authors and should not be interpreted as representing official policies, either expressed * or implied, of JogAmp Community. */ package com.jogamp.common.util; import java.util.ArrayList; import java.util.List; /** Basic utility functions for {@link String} and {@link CharSequence} in general. */ public class StringUtil { /** Linefeed character unicode {@code '\n'}, 0x000A. */ public static final char LF = '\n'; /** CR character unicode {@code '\r'}, 0x000D. */ public static final char CR = '\r'; /** Space character unicode {@code ' '}, 0x0020. 
*/ public static final char SPACE = ' '; /** * List of ASCII & Unicode space separator, aka {@code Whitespace}. * @see https://www.compart.com/en/unicode/category/Zs * @see https://en.wikipedia.org/wiki/Whitespace_character * @see https://www.unicode.org/reports/tr44/#General_Category_Values * @see ftp://ftp.unicode.org/Public/UNIDATA/Scripts.txt * @see https://www.w3schools.com/charsets/ref_utf_punctuation.asp */ public static final String WHITESPACE = String.valueOf("\t")+ // char tabulator String.valueOf(LF)+ // LF 0x000A String.valueOf(Character.toChars(0x000B))+ // line tab String.valueOf(Character.toChars(0x000C))+ // FF String.valueOf(CR)+ // CR 0x000D String.valueOf(SPACE)+ // SPACE 0x0020 String.valueOf(Character.toChars(0x0085))+ // Next Line String.valueOf(Character.toChars(0x00A0))+ // No-break space (NBSP) String.valueOf(Character.toChars(0x1680))+ // Ogham Space Mark String.valueOf(Character.toChars(0x2000))+ // En Quad String.valueOf(Character.toChars(0x2001))+ // Em Quad String.valueOf(Character.toChars(0x2002))+ // En Space String.valueOf(Character.toChars(0x2003))+ // Em Space String.valueOf(Character.toChars(0x2004))+ // Three-Per Em-Space String.valueOf(Character.toChars(0x2005))+ // Four-Per Em-Space String.valueOf(Character.toChars(0x2006))+ // Six-Per Em-Space String.valueOf(Character.toChars(0x2007))+ // Figure-Space String.valueOf(Character.toChars(0x2008))+ // Punctuation-Space String.valueOf(Character.toChars(0x2009))+ // Thin-Space String.valueOf(Character.toChars(0x200A))+ // Hair-Space String.valueOf(Character.toChars(0x202F))+ // Narrow No-break space (NNBSP) String.valueOf(Character.toChars(0x205F))+ // Medium Mathematical space (MMSP) String.valueOf(Character.toChars(0x3000)); // Ideographic Space /** Return true if given codepoint in included within {@link #WHITESPACE}. */ public static boolean isWhitespace(final int cp) { return 0 <= WHITESPACE.indexOf(cp); } /** * Returns true if given codepoint is a fullwidth unicode character. 
* @see https://www.compart.com/en/unicode/block/U+FF00 */ public static boolean isFullwidth(final int cp) { return 0xff01 <= cp && cp <= 0xff60; } /** * Returns true if given codepoint is a halfwidth unicode character. * @see https://www.compart.com/en/unicode/block/U+FF00 */ public static boolean isHalfwidth(final int cp) { return 0xff61 == cp && cp <= 0xffee; } /** Returns true if given codepoint is either {@link #isFullwidth(int)}, {@link #isHalfwidth(int)} or {@link #isWhitespace(int)}. */ public static boolean hasSpace(final int cp) { return isFullwidth(cp) || isHalfwidth(cp) || 0 <= WHITESPACE.indexOf(cp); } /** Returns number of lines, i.e. number of non-empty lines, separated by {@link #LF}. */ public static int getLineCount(final CharSequence s) { if( null == s ) { return 0; } final int len = s.length(); if( 0 == len ) { return 0; } int lc = 0; for (int i=0; len > i; ) { ++lc; final int j = indexOf(s, LF, i); if ( 0 > j ) { break; } i = j + 1; } return lc; } /** * Calls {@link String#indexOf(int, int)} * @param hay the unicode character string to search in from {@code fromIdx} * @param needle the unicode code point character to search * @param start index to start searching * @return {@code -1} if not found, otherwise [0..{@link String#length()}-1]. * @see #indexOf(CharSequence, int, int) * @see String#indexOf(int, int) */ public static int indexOf(final String hay, final int needle, final int start) { if( null != hay ) { return hay.indexOf(needle, start); } return -1; } /** * Naive implementation of {@link String#indexOf(int, int)} for type {@link CharSequence}. *
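* <p>
     * Uses {@link String#indexOf(int, int)} if {@code hay} is of type {@link String},
     * otherwise scans {@code hay} one {@code char} at a time, comparing the surrogate pair
     * of two adjacent chars if {@code needle} is a supplementary code point.
     * </p>
     *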
* @param hay the unicode character string to search in from {@code fromIdx} * @param needle the unicode code point character to search * @param start index to start searching * @return {@code -1} if not found, otherwise [0..{@link String#length()}-1]. * @see #indexOf(String, char, int) * @see String#indexOf(int, int) */ public static int indexOf(final CharSequence hay, final int needle, final int start) { if( null != hay ) { if (hay instanceof String) { return ((String) hay).indexOf(needle, start); } final int l = hay.length(); final int s = Math.max(0, start); if ( l > s ) { if (needle < Character.MIN_SUPPLEMENTARY_CODE_POINT) { for (int i = s; i < l; i++) { if (hay.charAt(i) == needle) { return i; } } } else if (Character.isValidCodePoint(needle)) { final char[] chars = Character.toChars(needle); for (int i = s; i < l - 1; i++) { final char hi = hay.charAt(i); // Character.toCodePoint(hi, lo); final char lo = hay.charAt(i + 1); if (hi == chars[0] && lo == chars[1]) { return i; } } } } } return -1; } /** * Remove all leading, trailing and duplicate-within {@code separators} unicode character from the {@code text}. ** Duplicate {@code separators} unicode character within the {@code text} are reduced to one occurrence * and might be replaced with {@code replacement} if not {@code null}. *
* @param text the source text * @param separators separator unicode characters, pass {@code null} for {@link Character#isWhitespace(int) whitespace}. * Consider using {@link #WHITESPACE} to cover all unicode space character. * @param replacement optional replacement string for matched separator within sequence removing duplicated. * If {@code null}, the first found separator is used. * @return stripped text */ public static String trim(final String text, final String separators, final String replacement) { if (text == null ) { return ""; } final int len = text.length(); if (len == 0) { return ""; } final StringBuilder sb = new StringBuilder(); boolean lastMatched = false; if (separators == null) { for(int i=0; i < len; ++i) { final int cp = text.codePointAt(i); final boolean match = Character.isWhitespace(cp); if ( !match || !lastMatched && 0 < i ) { if( match && null != replacement ) { sb.append(replacement); } else { sb.append(String.valueOf(Character.toChars(cp))); } } lastMatched = match; } if(lastMatched) { sb.deleteCharAt(sb.length()-1); } } else { for(int i=0; i < len; ++i) { final int cp = text.codePointAt(i); final boolean match = separators.indexOf(cp) >= 0; if ( !match || !lastMatched && 0 < i ) { if( match && null != replacement ) { sb.append(replacement); } else { sb.append(String.valueOf(Character.toChars(cp))); } } lastMatched = match; } if(lastMatched) { sb.deleteCharAt(sb.length()-1); } } return sb.toString(); } /** * Returns an array of split {@code text} at {@code separators} or {@link Character#isWhitespace(int) whitespace}. ** Each line's cutting point is the first {@code separator} or {@link Character#isWhitespace(int) whitespace} * occurrence starting at {@code text.length() / lineCount * 0.9}. *
** The separator or {@link Character#isWhitespace(int) whitespace} character * at the cutting point is skipped in the resulting array of the split parts, i.e. lines. *
* @param text the text to be split, {@code null} results in an empty list * @param lineCount number of resulting lines * @param separators separator unicode characters, pass {@code null} for {@link Character#isWhitespace(int) whitespace}. * Consider using {@link #WHITESPACE} to cover all unicode space character. * @see #split(String, int, String, String) */ public static List* Each line's cutting point is the first {@code separator} or {@link Character#isWhitespace(int) whitespace} * occurrence starting at {@code text.length() / lineCount * 0.9}. *
** The separator character or {@link Character#isWhitespace(int) whitespace} * at the cutting point is skipped in the string of glued split parts, i.e. lines. *
* @param text the text to be split, {@code null} results in an empty list * @param lineCount number of resulting lines * @param separators separator unicode characters, pass {@code null} for {@link Character#isWhitespace(int) whitespace}. * Consider using {@link #WHITESPACE} to cover all unicode space character. * @param lineSeparator the glue placed between the split lines in the concatenated result * @see #split(String, int, String) */ public static String split(final String text, final int lineCount, final String separators, final String lineSeparator) { final List