diff --git a/src/main/java/org/apache/commons/text/StringTokenizer.java b/src/main/java/org/apache/commons/text/StringTokenizer.java index 33ea73cad5..591f4b208f 100644 --- a/src/main/java/org/apache/commons/text/StringTokenizer.java +++ b/src/main/java/org/apache/commons/text/StringTokenizer.java @@ -239,6 +239,9 @@ public static StringTokenizer getTSVInstance(final String input) { /** Whether to ignore empty tokens. */ private boolean ignoreEmptyTokens = true; + /** Whether to omit delimiter matches from output. */ + private boolean omitDelimiterMatches = true; + /** * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to * tokenize. @@ -751,8 +754,11 @@ private int readNextToken(final char[] srcChars, int start, final int len, final // handle empty token final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len); if (delimLen > 0) { - addToken(tokenList, StringUtils.EMPTY); - return start + delimLen; + //empty token is not possible if we are including delimiters in token + if (omitDelimiterMatches) { + addToken(tokenList, StringUtils.EMPTY); + return start + delimLen; + } } // handle found token @@ -826,7 +832,14 @@ private int readWithQuotes(final char[] srcChars, final int start, final int len if (delimLen > 0) { // return condition when end of token found addToken(tokenList, workArea.substring(0, trimStart)); - return pos + delimLen; + if (omitDelimiterMatches) { + return pos + delimLen; + } else { + //increment position only if we found a new delimiter + if (pos > start) { + return pos; + } + } } // check for quote, and thus back into quoting mode @@ -1021,6 +1034,17 @@ public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) { return this; } + /** + * Sets whether the tokenizer should omit the delimiter matches from the output tokens. Default is true. 
+ * + * @param omitDelimiterMatches whether delimiter matches are omitted + * @return this, to enable chaining + */ + public StringTokenizer setOmitDelimiterMatches(final boolean omitDelimiterMatches) { + this.omitDelimiterMatches = omitDelimiterMatches; + return this; + } + /** * Sets the quote character to use. *
diff --git a/src/main/java/org/apache/commons/text/TokenFormatter.java b/src/main/java/org/apache/commons/text/TokenFormatter.java new file mode 100644 index 0000000000..45609e837e --- /dev/null +++ b/src/main/java/org/apache/commons/text/TokenFormatter.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text; + +public interface TokenFormatter { + String format(char[] prior, int tokenIndex, char[] token); +} diff --git a/src/main/java/org/apache/commons/text/TokenFormatterFactory.java b/src/main/java/org/apache/commons/text/TokenFormatterFactory.java new file mode 100644 index 0000000000..b693301527 --- /dev/null +++ b/src/main/java/org/apache/commons/text/TokenFormatterFactory.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text; + +import org.apache.commons.lang3.StringUtils; + +public class TokenFormatterFactory { + + /** + * Token formatter that returns the token as is. + */ + public static class NoOpFormatter implements TokenFormatter { + @Override + public String format(char[] prior, int tokenIndex, char[] token) { + return new String(token); + } + + } + + /** + * Token formatter that always returns a constant string, and optionally checks the passed in token + * for the constant and throws an error when found. + */ + public static class ConstantTokenFormatter implements TokenFormatter { + + /** + * The constant to return. + */ + private char[] constant; + + /** + * Whether or not to throw an exception if the constant is found. 
+ */ + private boolean failOnConstantFound = true; + + public ConstantTokenFormatter(char constant) { + this(new char[] {constant}, true); + } + + public ConstantTokenFormatter(char constant, boolean failOnConstantFound) { + this(new char[] {constant}, failOnConstantFound); + } + + public ConstantTokenFormatter(String constant) { + this(constant, true); + } + + public ConstantTokenFormatter(String constant, boolean failOnConstantFound) { + this(constant.toCharArray(), failOnConstantFound); + } + + public ConstantTokenFormatter(char[] constant, boolean failOnConstantFound) { + this.constant = constant; + this.failOnConstantFound = failOnConstantFound; + } + + /** + * Returns the constant as a new String. When {@code failOnConstantFound} is set, {@code token} is first scanned + * for an occurrence of the constant and rejected if one is found. An empty constant never matches, because the + * match flag is only set inside the per-character comparison loop. + * + * @param prior not read by this implementation + * @param tokenIndex index of the token, used only in the exception message + * @param token the characters to scan for the constant + * @return a new String built from the constant + * @throws IllegalArgumentException if {@code failOnConstantFound} is set and {@code token} contains the constant + */ @Override + public String format(char[] prior, int tokenIndex, char[] token) { + if (failOnConstantFound) { + int end = token.length - (constant.length - 1); + for (int i = 0; i < end; i++) { + boolean match = false; + int t = i; + for (int j = 0; j < constant.length; j++) { + if (token[t] == constant[j]) { + match = true; + } else { + match = false; + break; + } + t++; + } + if (match) { + /* NOTE(review): t was advanced once per matched char, so the index reported below is i + constant.length (one past the matched run), not the match start i; the message also says 'character' even when the constant has several chars - confirm intent. */ throw new IllegalArgumentException("Token " + tokenIndex + " contains illegal character '" + new String(constant) + "' at index " + t); + } + } + } + + return new String(constant); + } + + /** + * Sets whether {@code format} scans the token for the constant and throws when it is found. + * @param checkTokenForConstant whether to check. + */ + public void setFailOnConstantFound(boolean checkTokenForConstant) { + this.failOnConstantFound = checkTokenForConstant; + } + + } + + /** + * Reusable NoOpFormatter instance. + */ + private static final NoOpFormatter NOOP_FORMATTER = new NoOpFormatter(); + + /** + * Reusable Empty String formatter instance. 
+ */ + private static final ConstantTokenFormatter EMPTY_STRING_FORMATTER = new ConstantTokenFormatter(StringUtils.EMPTY, false); + + public static NoOpFormatter noOpFormatter() { + return NOOP_FORMATTER; + } + + public static ConstantTokenFormatter constantFormatter(char[] constant, boolean failOnConstant) { + return new ConstantTokenFormatter(constant, failOnConstant); + } + + public static ConstantTokenFormatter constantFormatter(char constant, boolean failOnConstant) { + return new ConstantTokenFormatter(constant, failOnConstant); + } + + public static ConstantTokenFormatter emptyFormatter() { + return EMPTY_STRING_FORMATTER; + } +} diff --git a/src/main/java/org/apache/commons/text/TokenStringifier.java b/src/main/java/org/apache/commons/text/TokenStringifier.java new file mode 100644 index 0000000000..ee4e902de1 --- /dev/null +++ b/src/main/java/org/apache/commons/text/TokenStringifier.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text; + +/** + * Takes a collection of String tokens and combines them into a single String. + *
+ * This class functions as the inverse of {@link org.apache.commons.text.StringTokenizer}. All tokens are formatted + * by a {@link TokenFormatter}, which allows fine-grained control over the final output. + *
+ */ +public class TokenStringifier { + + /** + * The formatter for the delimiter. + */ + private TokenFormatter delimiterFormatter; + + /** + * The formatter for the tokens. + */ + private TokenFormatter tokenFormatter; + + /** + * Builder used to hold formatted tokens. + */ + private StringBuilder builder; + + /** + * The final string. + */ + private String string; + + /** + * The tokens to turn into a String. + */ + private Iterable+ * CamelCase is a case where tokens are delimited by upper case Unicode characters. The very first + * token should begin with a lower case character, and any subsequent tokens begin with an + * upper case character. All remaining characters will be lower case or non cased. + *
+ */ +public final class CamelCase extends UpperCaseDelimitedCase { + + /** Constant reusable instance of this case. */ + public static final CamelCase INSTANCE = new CamelCase(); + + /** + * Constructs new CamelCase instance. + */ + private CamelCase() { + super(true); + } + +} diff --git a/src/main/java/org/apache/commons/text/cases/Case.java b/src/main/java/org/apache/commons/text/cases/Case.java new file mode 100644 index 0000000000..c9be5e2d48 --- /dev/null +++ b/src/main/java/org/apache/commons/text/cases/Case.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.cases; + +import java.util.List; + +/** + * Formats and parses tokens to/from a String. In most implementations tokens returned + * by the parse method abide by any restrictions present in the format method. That is, calling + * format() with the results of a call to parse() on the same Case instance should return a + * matching String. + * + * @since 1.11 + */ +public interface Case { + + /** + * Formats a set of tokens into a string. The tokens do not necessarily have to meet the syntax + * requirements of the Case. The documentation for each implementation should specify what input + * is supported. 
+ * + * @param tokens string tokens to be formatted by this Case + * @return the formatted string + * @throws IllegalArgumentException if tokens cannot be formatted + */ + String format(Iterable+ * KebabCase is a delimited case where the delimiter is a hyphen character '-'. + *
+ */ +public final class KebabCase extends CharacterDelimitedCase { + + /** Constant for delimiter. */ + private static final char DELIMITER = '-'; + + /** Constant reusable instance of this case. */ + public static final KebabCase INSTANCE = new KebabCase(); + + /** + * Constructs a new KebabCase instance. + */ + private KebabCase() { + super(DELIMITER); + } + +} diff --git a/src/main/java/org/apache/commons/text/cases/PascalCase.java b/src/main/java/org/apache/commons/text/cases/PascalCase.java new file mode 100644 index 0000000000..3d298cdb37 --- /dev/null +++ b/src/main/java/org/apache/commons/text/cases/PascalCase.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.cases; + +/** + * Case implementation which parses and formats strings of the form 'MyPascalString' + *+ * PascalCase tokens are delimited by upper case Unicode characters. Each parsed token + * begins with an upper case character, and remaining token characters are either lower case or non cased. + *
+ */ +public final class PascalCase extends UpperCaseDelimitedCase { + + /** Constant reusable instance of this case. */ + public static final PascalCase INSTANCE = new PascalCase(); + + /** + * Constructs a new PascalCase instance. + */ + private PascalCase() { + super(false); + } + +} diff --git a/src/main/java/org/apache/commons/text/cases/PascalTokenFormatter.java b/src/main/java/org/apache/commons/text/cases/PascalTokenFormatter.java new file mode 100644 index 0000000000..d9874a9f77 --- /dev/null +++ b/src/main/java/org/apache/commons/text/cases/PascalTokenFormatter.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.cases; + +import org.apache.commons.text.TokenFormatter; + +public class PascalTokenFormatter implements TokenFormatter { + + /** + * Whether or not to set the first character of the first token as lower case. 
+ */ + private boolean lowerCaseFirstCharacter = false; + + public PascalTokenFormatter(boolean lowerCaseFirstCharacter) { + this.lowerCaseFirstCharacter = lowerCaseFirstCharacter; + } + + public PascalTokenFormatter() { } + + @Override + public String format(char[] prior, int tokenIndex, char[] token) { + if (token == null || token.length == 0) { + throw new IllegalArgumentException("Unsupported empty token at index " + tokenIndex); + } + StringBuilder formattedString = new StringBuilder(); + + for (int i = 0; i < token.length;) { + final int codePoint = Character.codePointAt(token, i); + //final int codePoint = token.codePointAt(i); + int codePointFormatted = codePoint; + if (i == 0 && tokenIndex == 0 && lowerCaseFirstCharacter) { + codePointFormatted = toLowerCase(codePoint); + } else if (i == 0) { + codePointFormatted = toUpperCase(codePoint); + } else if (Character.isUpperCase(codePointFormatted) || Character.isTitleCase(codePointFormatted)) { + //if character is title or upper case, it must be converted to lower + codePointFormatted = toLowerCase(codePoint); + } + formattedString.appendCodePoint(codePointFormatted); + i += Character.charCount(codePoint); + } + + return formattedString.toString(); + } + + /** + * Transforms a Unicode code point into upper case using {@link java.lang.Character#toUpperCase} and confirms the + * result is upper case. 
+ * + * @param codePoint the code point to upper case + * @return the transformed code point + * @throws IllegalArgumentException if the converted code point cannot be mapped into an upper case character + */ + private static int toUpperCase(int codePoint) { + int codePointFormatted = Character.toUpperCase(codePoint); + if (!Character.isUpperCase(codePointFormatted)) { + throw new IllegalArgumentException(createExceptionMessage(codePoint, " cannot be mapped to upper case")); + } + return codePointFormatted; + } + + /** + * Transforms a Unicode code point into lower case using {@link java.lang.Character#toLowerCase} and confirms the + * result is lower case. + * + * @param codePoint the code point to lower case + * @return the lower case code point that corresponds to the input parameter + * @throws IllegalArgumentException if the converted code point cannot be mapped into a lower case character + */ + private static int toLowerCase(int codePoint) { + int codePointFormatted = Character.toLowerCase(codePoint); + if (!Character.isLowerCase(codePointFormatted)) { + throw new IllegalArgumentException(createExceptionMessage(codePoint, " cannot be mapped to lower case")); + } + return codePointFormatted; + } + + /** + * Creates an exception message that displays the Unicode character as well as the hex value for clarity. 
+ * + * @param codePoint the Unicode code point to transform + * @param suffix a string suffix for the message + * @return the message + */ + private static String createExceptionMessage(int codePoint, String suffix) { + return "Character '" + new String(new int[] { codePoint }, 0, 1) + "' with value 0x" + Integer.toHexString(codePoint) + suffix; + } + +} diff --git a/src/main/java/org/apache/commons/text/cases/SnakeCase.java b/src/main/java/org/apache/commons/text/cases/SnakeCase.java new file mode 100644 index 0000000000..b6e1ae74d3 --- /dev/null +++ b/src/main/java/org/apache/commons/text/cases/SnakeCase.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.cases; + +/** + * Case implementation which parses and formats strings of the form 'my_snake_string' + *+ * SnakeCase is a delimited case where the delimiter is the underscore character '_'. + *
+ */ +public final class SnakeCase extends CharacterDelimitedCase { + + /** Constant for delimiter. */ + private static final char DELIMITER = '_'; + + /** Constant reusable instance of this case. */ + public static final SnakeCase INSTANCE = new SnakeCase(); + + /** + * Constructs a new SnakeCase instance. + */ + private SnakeCase() { + super(DELIMITER); + } + +} diff --git a/src/main/java/org/apache/commons/text/cases/UpperCaseDelimitedCase.java b/src/main/java/org/apache/commons/text/cases/UpperCaseDelimitedCase.java new file mode 100644 index 0000000000..940522fec5 --- /dev/null +++ b/src/main/java/org/apache/commons/text/cases/UpperCaseDelimitedCase.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.cases; + +import java.util.List; + +import org.apache.commons.text.StringTokenizer; +import org.apache.commons.text.TokenFormatterFactory; +import org.apache.commons.text.TokenStringifier; +import org.apache.commons.text.matcher.StringMatcherFactory; + + +/** + * Case implementation which parses and formats strings where tokens are delimited by upper case characters. + */ +public class UpperCaseDelimitedCase implements Case { + + /** + * The tokenizer. 
+ */ + private StringTokenizer tokenizer; + + /** + * The stringifier. + */ + private TokenStringifier stringifier; + + /** + * Constructs a new UpperCaseDelimitedCase instance. + */ + UpperCaseDelimitedCase(boolean lowerCaseFirstCharacter) { + tokenizer = new StringTokenizer((String) null, StringMatcherFactory.INSTANCE.uppercaseMatcher()); + tokenizer.setOmitDelimiterMatches(false); + stringifier = new TokenStringifier(TokenFormatterFactory.emptyFormatter(), new PascalTokenFormatter(lowerCaseFirstCharacter)); + } + + /** + * Parses a string into tokens. + *+ * String characters are iterated over and when an upper case Unicode character is + * encountered, that character starts a new token, with the character + * itself included in the token. This method never returns empty tokens. + *
+ * + * @param string the string to parse + * @return the list of tokens found in the string + */ + @Override + public List+ * Iterates the tokens and formats each one into a token where the first character of the token + * is forced upper case in the output. The remaining characters of the token will be lower case + * or non cased. Conversions to lower case are attempted and any conversion that is not possible + * throws an exception. Any other characters in the token are returned as-is. Empty tokens are + * not supported and will cause an exception to be thrown. + *
+ * + * @param tokens the string tokens to be formatted + * @return the formatted string + * @throws IllegalArgumentException if 1) any token is empty 2) any token begins with a + * character that cannot be mapped to upper case, or 3) any token contains an upper or title case + * character that cannot be mapped to lower case. + */ + @Override + public String format(IterableProvides algorithms for parsing and formatting various programming "Cases".
+ * <p>Two base classes are provided to hold functionality common to multiple cases:
+ * UpperCaseDelimitedCase - delimited by upper case characters.
+ * CharacterDelimitedCase - delimited by a constant character, which is omitted from parsed tokens.
+ * Four full implementations are provided for the most widely used cases:
+ * CamelCase - extension of UpperCaseDelimitedCase where first character must be lower case.
+ * PascalCase - extension of UpperCaseDelimitedCase where first character must be upper case.
+ * SnakeCase - extension of CharacterDelimitedCase in which the delimiter is an underscore '_'.
+ * KebabCase - extension of CharacterDelimitedCase in which the delimiter is a hyphen '-'.
+ *
+ * Thread-safe. + *
+ */ + static final class UppercaseMatcher extends AbstractStringMatcher { + + /** + * Constructs a new instance of {@code UppercaseMatcher}. + */ + UppercaseMatcher() { + } + + /** + * Returns the number of {@code char} values occupied by the code point at {@code start} when that code point is + * upper case according to {@link Character#isUpperCase(int)}, or {@code 0} when it is not. + * + * @param buffer the text content to match against, do not change + * @param start the starting position for the match, valid for buffer + * @param bufferStart unused + * @param bufferEnd unused + * @return the {@link Character#charCount(int)} of the matched code point, zero for no match + */ + @Override + public int isMatch(char[] buffer, int start, int bufferStart, int bufferEnd) { + int codePoint = Character.codePointAt(buffer, start); + return Character.isUpperCase(codePoint) ? Character.charCount(codePoint) : 0; + } + + /** + * Returns the number of {@code char} values occupied by the code point at {@code start} when that code point is + * upper case according to {@link Character#isUpperCase(int)}, or {@code 0} when it is not. + * + * @param buffer the text content to match against, do not change + * @param start the starting position for the match, valid for buffer + * @param bufferStart unused + * @param bufferEnd unused + * @return the {@link Character#charCount(int)} of the matched code point, zero for no match + */ + @Override + public int isMatch(final CharSequence buffer, final int start, final int bufferStart, final int bufferEnd) { + int codePoint = Character.codePointAt(buffer, start); + return Character.isUpperCase(codePoint) ? Character.charCount(codePoint) : 0; + } + + /** + * Not supported by this matcher. + * + * @return never returns normally + * @throws UnsupportedOperationException always + */ + @Override + public int size() { + throw new UnsupportedOperationException("Uppercase Matcher doesn't support size() method"); + } + } + /** * Constructs a new instance. 
*/ diff --git a/src/main/java/org/apache/commons/text/matcher/StringMatcherFactory.java b/src/main/java/org/apache/commons/text/matcher/StringMatcherFactory.java index c08b79553b..b6d16e1974 100644 --- a/src/main/java/org/apache/commons/text/matcher/StringMatcherFactory.java +++ b/src/main/java/org/apache/commons/text/matcher/StringMatcherFactory.java @@ -81,6 +81,11 @@ public final class StringMatcherFactory { */ private static final AbstractStringMatcher.TrimMatcher TRIM_MATCHER = new AbstractStringMatcher.TrimMatcher(); + /** + * Matches Unicode upper case characters. + */ + private static final AbstractStringMatcher.UppercaseMatcher UPPERCASE_MATCHER = new AbstractStringMatcher.UppercaseMatcher(); + /** * No need to build instances for now. */ @@ -255,4 +260,13 @@ public StringMatcher trimMatcher() { return TRIM_MATCHER; } + /** + * Matches Unicode uppercase characters. + * + * @return The upper case matcher + */ + public StringMatcher uppercaseMatcher() { + return UPPERCASE_MATCHER; + } + } diff --git a/src/test/java/org/apache/commons/text/StringTokenizerTest.java b/src/test/java/org/apache/commons/text/StringTokenizerTest.java index 2b46346d33..e8a9ab38c5 100644 --- a/src/test/java/org/apache/commons/text/StringTokenizerTest.java +++ b/src/test/java/org/apache/commons/text/StringTokenizerTest.java @@ -375,6 +375,40 @@ public void testBasicIgnoreTrimmed4() { assertFalse(tok.hasNext()); } + @Test + public void testOmitDelimiter1() { + final String input = "AbcDefGhi"; + final StringTokenizer tok = new StringTokenizer(input, StringMatcherFactory.INSTANCE.uppercaseMatcher()); + tok.setOmitDelimiterMatches(false); + assertEquals("Abc", tok.next()); + assertEquals("Def", tok.next()); + assertEquals("Ghi", tok.next()); + assertFalse(tok.hasNext()); + } + + @Test + public void testOmitDelimiter2() { + final String input = "Abc:Def:Ghi"; + final StringTokenizer tok = new StringTokenizer(input, ':'); + tok.setOmitDelimiterMatches(false); + assertEquals("Abc", 
tok.next()); + assertEquals(":Def", tok.next()); + assertEquals(":Ghi", tok.next()); + assertFalse(tok.hasNext()); + } + + @Test + public void testOmitDelimiter3() { + final String input = "Abc :Def :Ghi "; + final StringTokenizer tok = new StringTokenizer(input, ':'); + tok.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher()); + tok.setOmitDelimiterMatches(false); + assertEquals("Abc", tok.next()); + assertEquals(":Def", tok.next()); + assertEquals(":Ghi", tok.next()); + assertFalse(tok.hasNext()); + } + @Test public void testBasicQuoted1() { final String input = "a 'b' c"; diff --git a/src/test/java/org/apache/commons/text/TokenFormatterFactoryTest.java b/src/test/java/org/apache/commons/text/TokenFormatterFactoryTest.java new file mode 100644 index 0000000000..787e0f7c0c --- /dev/null +++ b/src/test/java/org/apache/commons/text/TokenFormatterFactoryTest.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.text; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import org.apache.commons.text.TokenFormatterFactory.ConstantTokenFormatter; +import org.apache.commons.text.TokenFormatterFactory.NoOpFormatter; +import org.junit.jupiter.api.Test; + +public class TokenFormatterFactoryTest { + + @Test + public void testConstantTokenFormatterFailOnConstant() { + ConstantTokenFormatter formatter = TokenFormatterFactory.constantFormatter("abc".toCharArray(), true); + assertThrows(IllegalArgumentException.class, () -> formatter.format(null, 0, "dabc".toCharArray())); + assertThrows(IllegalArgumentException.class, () -> formatter.format(null, 0, "abc".toCharArray())); + assertThrows(IllegalArgumentException.class, () -> formatter.format(null, 0, "abcd".toCharArray())); + + ConstantTokenFormatter unicode = TokenFormatterFactory.constantFormatter("\uD801\uDC00".toCharArray(), true); + assertThrows(IllegalArgumentException.class, () -> unicode.format(null, 0, "\uD801\uDC00".toCharArray())); + assertThrows(IllegalArgumentException.class, () -> unicode.format(null, 0, "a\uD801\uDC00".toCharArray())); + assertThrows(IllegalArgumentException.class, () -> unicode.format(null, 0, "\uD801\uDC00b".toCharArray())); + } + + @Test + public void testConstantTokenFormatter() { + ConstantTokenFormatter formatter = TokenFormatterFactory.constantFormatter("abc".toCharArray(), false); + assertEquals("abc", formatter.format(null, 0, new char[0])); + assertEquals("abc", formatter.format(null, 0, "abc".toCharArray())); + assertEquals("abc", formatter.format(null, 0, "abdc".toCharArray())); + assertEquals("abc", formatter.format(null, 0, "".toCharArray())); + + formatter = TokenFormatterFactory.constantFormatter("\uD801\uDC00".toCharArray(), true); + assertEquals("\uD801\uDC00", formatter.format(null, 0, new char[0])); + assertEquals("\uD801\uDC00", formatter.format(null, 0, 
"abc".toCharArray())); + } + + @Test + public void testNoOpFormatter() { + NoOpFormatter formatter = TokenFormatterFactory.noOpFormatter(); + assertEquals("\uD801\uDC00", formatter.format(null, 0, "\uD801\uDC00".toCharArray())); + assertEquals("\uD801\uDC00a", formatter.format(null, 0, "\uD801\uDC00a".toCharArray())); + assertEquals("abc", formatter.format(null, 0, "abc".toCharArray())); + assertEquals("", formatter.format(null, 0, "".toCharArray())); + } + + @Test + public void testEmptyFormatter() { + ConstantTokenFormatter formatter = TokenFormatterFactory.emptyFormatter(); + assertEquals("", formatter.format(null, 0, "\uD801\uDC00".toCharArray())); + assertEquals("", formatter.format(null, 0, "\uD801\uDC00a".toCharArray())); + assertEquals("", formatter.format(null, 0, "abc".toCharArray())); + assertEquals("", formatter.format(null, 0, "".toCharArray())); + } +} diff --git a/src/test/java/org/apache/commons/text/TokenStringifierTest.java b/src/test/java/org/apache/commons/text/TokenStringifierTest.java new file mode 100644 index 0000000000..33d7dc8ae6 --- /dev/null +++ b/src/test/java/org/apache/commons/text/TokenStringifierTest.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.text; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.Arrays; +import java.util.List; + +import org.junit.jupiter.api.Test; + +public class TokenStringifierTest { + + @Test + public void testTokenStringifier() { + TokenStringifier stringifier = new TokenStringifier(TokenFormatterFactory.constantFormatter(',', true), TokenFormatterFactory.noOpFormatter()); + List