#include "unicode/utypes.h"
#include "unicode/uchar.h"
Go to the source code of this file.
Data Structures | |
struct | USerializedSet |
A serialized form of a Unicode set. More... | |
Typedefs | |
typedef USet | USet |
A UnicodeSet. | |
typedef enum USetSpanCondition | USetSpanCondition |
Argument values for whether span() and similar functions continue while the current character is contained vs. not contained in the set. | |
typedef USerializedSet | USerializedSet |
A serialized form of a Unicode set. | |
Enumerations | |
enum | { USET_IGNORE_SPACE = 1, USET_CASE_INSENSITIVE = 2, USET_ADD_CASE_MAPPINGS = 4, USET_SERIALIZED_STATIC_ARRAY_CAPACITY = 8 } |
Bitmask values to be passed to uset_openPatternOptions() or uset_applyPattern() taking an option parameter. More... | |
enum | USetSpanCondition { USET_SPAN_NOT_CONTAINED = 0, USET_SPAN_CONTAINED = 1, USET_SPAN_SIMPLE = 2, USET_SPAN_CONDITION_COUNT } |
Argument values for whether span() and similar functions continue while the current character is contained vs. not contained in the set. More... | |
Functions | |
USet * | uset_open (UChar32 start, UChar32 end) |
Creates a USet object that contains the range of characters start..end, inclusive. | |
USet * | uset_openPattern (const UChar *pattern, int32_t patternLength, UErrorCode *ec) |
Creates a set from the given pattern. | |
USet * | uset_openPatternOptions (const UChar *pattern, int32_t patternLength, uint32_t options, UErrorCode *ec) |
Creates a set from the given pattern. | |
void | uset_close (USet *set) |
Disposes of the storage used by a USet object. | |
USet * | uset_clone (const USet *set) |
Returns a copy of this object. | |
UBool | uset_isFrozen (const USet *set) |
Determines whether the set has been frozen (made immutable) or not. | |
void | uset_freeze (USet *set) |
Freeze the set (make it immutable). | |
USet * | uset_cloneAsThawed (const USet *set) |
Clone the set and make the clone mutable. | |
void | uset_set (USet *set, UChar32 start, UChar32 end) |
Causes the USet object to represent the range start - end. | |
int32_t | uset_applyPattern (USet *set, const UChar *pattern, int32_t patternLength, uint32_t options, UErrorCode *status) |
Modifies the set to represent the set specified by the given pattern. | |
void | uset_applyIntPropertyValue (USet *set, UProperty prop, int32_t value, UErrorCode *ec) |
Modifies the set to contain those code points which have the given value for the given binary or enumerated property, as returned by u_getIntPropertyValue. | |
void | uset_applyPropertyAlias (USet *set, const UChar *prop, int32_t propLength, const UChar *value, int32_t valueLength, UErrorCode *ec) |
Modifies the set to contain those code points which have the given value for the given property. | |
UBool | uset_resemblesPattern (const UChar *pattern, int32_t patternLength, int32_t pos) |
Return true if the given position, in the given pattern, appears to be the start of a UnicodeSet pattern. | |
int32_t | uset_toPattern (const USet *set, UChar *result, int32_t resultCapacity, UBool escapeUnprintable, UErrorCode *ec) |
Returns a string representation of this set. | |
void | uset_add (USet *set, UChar32 c) |
Adds the given character to the given USet. | |
void | uset_addAll (USet *set, const USet *additionalSet) |
Adds all of the elements in the specified set to this set if they're not already present. | |
void | uset_addRange (USet *set, UChar32 start, UChar32 end) |
Adds the given range of characters to the given USet. | |
void | uset_addString (USet *set, const UChar *str, int32_t strLen) |
Adds the given string to the given USet. | |
void | uset_addAllCodePoints (USet *set, const UChar *str, int32_t strLen) |
Adds each of the characters in this string to the set. | |
void | uset_remove (USet *set, UChar32 c) |
Removes the given character from the given USet. | |
void | uset_removeRange (USet *set, UChar32 start, UChar32 end) |
Removes the given range of characters from the given USet. | |
void | uset_removeString (USet *set, const UChar *str, int32_t strLen) |
Removes the given string from the given USet. | |
void | uset_removeAll (USet *set, const USet *removeSet) |
Removes from this set all of its elements that are contained in the specified set. | |
void | uset_retain (USet *set, UChar32 start, UChar32 end) |
Retain only the elements in this set that are contained in the specified range. | |
void | uset_retainAll (USet *set, const USet *retain) |
Retains only the elements in this set that are contained in the specified set. | |
void | uset_compact (USet *set) |
Reallocate this object's internal structures to take up the least possible space, without changing this object's value. | |
void | uset_complement (USet *set) |
Inverts this set. | |
void | uset_complementAll (USet *set, const USet *complement) |
Complements in this set all elements contained in the specified set. | |
void | uset_clear (USet *set) |
Removes all of the elements from this set. | |
UBool | uset_isEmpty (const USet *set) |
Returns TRUE if the given USet contains no characters and no strings. | |
UBool | uset_contains (const USet *set, UChar32 c) |
Returns TRUE if the given USet contains the given character. | |
UBool | uset_containsRange (const USet *set, UChar32 start, UChar32 end) |
Returns TRUE if the given USet contains all characters c where start <= c && c <= end. | |
UBool | uset_containsString (const USet *set, const UChar *str, int32_t strLen) |
Returns TRUE if the given USet contains the given string. | |
int32_t | uset_indexOf (const USet *set, UChar32 c) |
Returns the index of the given character within this set, where the set is ordered by ascending code point. | |
UChar32 | uset_charAt (const USet *set, int32_t index) |
Returns the character at the given index within this set, where the set is ordered by ascending code point. | |
int32_t | uset_size (const USet *set) |
Returns the number of characters and strings contained in the given USet. | |
int32_t | uset_getItemCount (const USet *set) |
Returns the number of items in this set. | |
int32_t | uset_getItem (const USet *set, int32_t itemIndex, UChar32 *start, UChar32 *end, UChar *str, int32_t strCapacity, UErrorCode *ec) |
Returns an item of this set. | |
UBool | uset_containsAll (const USet *set1, const USet *set2) |
Returns true if set1 contains all the characters and strings of set2. | |
UBool | uset_containsAllCodePoints (const USet *set, const UChar *str, int32_t strLen) |
Returns true if this set contains all the characters of the given string. | |
UBool | uset_containsNone (const USet *set1, const USet *set2) |
Returns true if set1 contains none of the characters and strings of set2. | |
UBool | uset_containsSome (const USet *set1, const USet *set2) |
Returns true if set1 contains some of the characters and strings of set2. | |
int32_t | uset_span (const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) |
Returns the length of the initial substring of the input string which consists only of characters and strings that are contained in this set (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), or only of characters and strings that are not contained in this set (USET_SPAN_NOT_CONTAINED). | |
int32_t | uset_spanBack (const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) |
Returns the start of the trailing substring of the input string which consists only of characters and strings that are contained in this set (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), or only of characters and strings that are not contained in this set (USET_SPAN_NOT_CONTAINED). | |
int32_t | uset_spanUTF8 (const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) |
Returns the length of the initial substring of the input string which consists only of characters and strings that are contained in this set (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), or only of characters and strings that are not contained in this set (USET_SPAN_NOT_CONTAINED). | |
int32_t | uset_spanBackUTF8 (const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) |
Returns the start of the trailing substring of the input string which consists only of characters and strings that are contained in this set (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), or only of characters and strings that are not contained in this set (USET_SPAN_NOT_CONTAINED). | |
UBool | uset_equals (const USet *set1, const USet *set2) |
Returns true if set1 contains all of the characters and strings of set2, and vice versa. | |
int32_t | uset_serialize (const USet *set, uint16_t *dest, int32_t destCapacity, UErrorCode *pErrorCode) |
Serializes this set into an array of 16-bit integers. | |
UBool | uset_getSerializedSet (USerializedSet *fillSet, const uint16_t *src, int32_t srcLength) |
Given a serialized array, fill in the given serialized set object. | |
void | uset_setSerializedToOne (USerializedSet *fillSet, UChar32 c) |
Set the USerializedSet to contain the given character (and nothing else). | |
UBool | uset_serializedContains (const USerializedSet *set, UChar32 c) |
Returns TRUE if the given USerializedSet contains the given character. | |
int32_t | uset_getSerializedRangeCount (const USerializedSet *set) |
Returns the number of disjoint ranges of characters contained in the given serialized set. | |
UBool | uset_getSerializedRange (const USerializedSet *set, int32_t rangeIndex, UChar32 *pStart, UChar32 *pEnd) |
Returns a range of characters contained in the given serialized set. |
This is a C wrapper around the C++ UnicodeSet class.
Definition in file uset.h.
|
A serialized form of a Unicode set. Limited manipulations are possible directly on a serialized set. See below.
|
|
A UnicodeSet. Use the uset_* API to manipulate. Create with uset_open*, and destroy with uset_close.
|
|
Argument values for whether span() and similar functions continue while the current character is contained vs. not contained in the set. The functionality is straightforward for sets with only single code points, without strings (which is the common case):
When a set contains multi-code point strings, then these statements may not be true, depending on the strings in the set (for example, whether they overlap with each other) and the string that is processed. For a set with strings:
Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could be used. Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point boundaries, never in the middle of a surrogate pair. Illegal UTF-8 sequences are treated like U+FFFD. When processing UTF-8 strings, malformed set strings (strings with unpaired surrogates which cannot be converted to UTF-8) are ignored.
|
|
Bitmask values to be passed to uset_openPatternOptions() or uset_applyPattern() taking an option parameter.
|
|
Argument values for whether span() and similar functions continue while the current character is contained vs. not contained in the set. The functionality is straightforward for sets with only single code points, without strings (which is the common case):
When a set contains multi-code point strings, then these statements may not be true, depending on the strings in the set (for example, whether they overlap with each other) and the string that is processed. For a set with strings:
Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could be used. Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point boundaries, never in the middle of a surrogate pair. Illegal UTF-8 sequences are treated like U+FFFD. When processing UTF-8 strings, malformed set strings (strings with unpaired surrogates which cannot be converted to UTF-8) are ignored.
|
|
Adds the given character to the given USet. After this call, uset_contains(set, c) will return TRUE. A frozen set will not be modified.
|
|
Adds all of the elements in the specified set to this set if they're not already present. This operation effectively modifies this set so that its value is the union of the two sets. The behavior of this operation is unspecified if the specified collection is modified while the operation is in progress. A frozen set will not be modified.
|
|
Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}. If this set already contains any particular character, it has no effect on that character. A frozen set will not be modified.
|
|
Adds the given range of characters to the given USet. After this call, uset_containsRange(set, start, end) will return TRUE. A frozen set will not be modified.
|
|
Adds the given string to the given USet. After this call, uset_containsString(set, str, strLen) will return TRUE. A frozen set will not be modified.
|
|
Modifies the set to contain those code points which have the given value for the given binary or enumerated property, as returned by u_getIntPropertyValue. Prior contents of this set are lost. A frozen set will not be modified.
|
|
Modifies the set to represent the set specified by the given pattern. See the UnicodeSet class description for the syntax of the pattern language. See also the User Guide chapter about UnicodeSet. Empties the set passed before applying the pattern. A frozen set will not be modified.
|
|
Modifies the set to contain those code points which have the given value for the given property. Prior contents of this set are lost. A frozen set will not be modified.
|
|
Returns the character at the given index within this set, where the set is ordered by ascending code point.
If the index is out of range, return (UChar32)-1. The inverse of this method is uset_indexOf().
|
|
Removes all of the elements from this set. This set will be empty after this call returns. A frozen set will not be modified.
|
|
Returns a copy of this object. If this set is frozen, then the clone will be frozen as well. Use uset_cloneAsThawed() for a mutable clone of a frozen set.
|
|
Clone the set and make the clone mutable. See the ICU4J Freezable interface for details.
|
|
Disposes of the storage used by a USet object. This function should be called exactly once for objects returned by uset_open().
|
|
Reallocate this object's internal structures to take up the least possible space, without changing this object's value. A frozen set will not be modified.
|
|
Inverts this set. This operation modifies this set so that its value is its complement. This operation does not affect the multicharacter strings, if any. A frozen set will not be modified.
|
|
Complements in this set all elements contained in the specified set. Any character in the other set will be removed if it is in this set, or will be added if it is not in this set. A frozen set will not be modified.
|
|
Returns TRUE if the given USet contains the given character. This function works faster with a frozen set.
|
|
Returns true if set1 contains all the characters and strings of set2. It answers the question, 'Is set1 a superset of set2?'
|
|
Returns true if this set contains all the characters of the given string. This does not check containment of grapheme clusters, like uset_containsString.
|
|
Returns true if set1 contains none of the characters and strings of set2. It answers the question, 'Is set1 disjoint from set2?'
|
|
Returns TRUE if the given USet contains all characters c where start <= c && c <= end.
|
|
Returns true if set1 contains some of the characters and strings of set2. It answers the question, 'Do set1 and set2 have an intersection?'
|
|
Returns TRUE if the given USet contains the given string.
|
|
Returns true if set1 contains all of the characters and strings of set2, and vice versa. It answers the question, 'Is set1 equal to set2?'
|
|
Freeze the set (make it immutable). Once frozen, it cannot be unfrozen and is therefore thread-safe until it is deleted. See the ICU4J Freezable interface for details. Freezing the set may also make some operations faster, for example uset_contains() and uset_span(). A frozen set will not be modified. (It remains frozen.)
|
|
Returns an item of this set. An item is either a range of characters or a single multicharacter string.
|
|
Returns the number of items in this set. An item is either a range of characters or a single multicharacter string.
|
|
Returns a range of characters contained in the given serialized set.
|
|
Returns the number of disjoint ranges of characters contained in the given serialized set. Ignores any strings contained in the set.
|
|
Given a serialized array, fill in the given serialized set object.
|
|
Returns the index of the given character within this set, where the set is ordered by ascending code point.
If the character is not in this set, return -1. The inverse of this method is uset_charAt().
|
|
Returns TRUE if the given USet contains no characters and no strings.
|
|
Determines whether the set has been frozen (made immutable) or not. See the ICU4J Freezable interface for details.
|
|
Creates a USet object that contains the range of characters start..end, inclusive.
If start > end then an empty set is created.
|
|
Creates a set from the given pattern. See the UnicodeSet class description for the syntax of the pattern language.
|
|
Creates a set from the given pattern. See the UnicodeSet class description for the syntax of the pattern language.
|
|
Removes the given character from the given USet. After this call, uset_contains(set, c) will return FALSE. A frozen set will not be modified.
|
|
Removes from this set all of its elements that are contained in the specified set. This operation effectively modifies this set so that its value is the asymmetric set difference of the two sets. A frozen set will not be modified.
|
|
Removes the given range of characters from the given USet. After this call, uset_containsRange(set, start, end) will return FALSE. A frozen set will not be modified.
|
|
Removes the given string from the given USet. After this call, uset_containsString(set, str, strLen) will return FALSE. A frozen set will not be modified.
|
|
Return true if the given position, in the given pattern, appears to be the start of a UnicodeSet pattern.
|
|
Retain only the elements in this set that are contained in the specified range.
If start > end then an empty range is retained, leaving the set empty.
|
|
Retains only the elements in this set that are contained in the specified set. In other words, removes from this set all of its elements that are not contained in the specified set. This operation effectively modifies this set so that its value is the intersection of the two sets. A frozen set will not be modified.
|
|
Serializes this set into an array of 16-bit integers. Serialization (currently) only records the characters in the set; multicharacter strings are ignored.
The array has the following format (each line is one 16-bit integer):
    length = (n+2*m) | (m!=0?0x8000:0)
    bmpLength = n; present if m!=0
    bmp[0]
    bmp[1]
    ...
    bmp[n-1]
    supp-high[0]
    supp-low[0]
    supp-high[1]
    supp-low[1]
    ...
    supp-high[m-1]
    supp-low[m-1]
The array starts with a header. After the header are n bmp code points, then m supplementary code points. Either n or m or both may be zero. n+2*m is always <= 0x7FFF. If there are no supplementary characters (if m==0) then the header is one 16-bit integer, 'length', with value n. If there are supplementary characters (if m!=0) then the header is two 16-bit integers. The first, 'length', has value (n+2*m)|0x8000. The second, 'bmpLength', has value n. After the header the code points are stored in ascending order. Supplementary code points are stored as most significant 16 bits followed by least significant 16 bits.
|
|
Returns TRUE if the given USerializedSet contains the given character.
|
|
Causes the USet object to represent the range start - end.
If start > end then this USet is set to an empty range.
|
|
Set the USerializedSet to contain the given character (and nothing else).
|
|
Returns the number of characters and strings contained in the given USet.
|
|
Returns the length of the initial substring of the input string which consists only of characters and strings that are contained in this set (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), or only of characters and strings that are not contained in this set (USET_SPAN_NOT_CONTAINED). See USetSpanCondition for details. Similar to the strspn() C library function. Unpaired surrogates are treated according to contains() of their surrogate code points. This function works faster with a frozen set and with a non-negative string length argument.
|
|
Returns the start of the trailing substring of the input string which consists only of characters and strings that are contained in this set (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), or only of characters and strings that are not contained in this set (USET_SPAN_NOT_CONTAINED). See USetSpanCondition for details. Unpaired surrogates are treated according to contains() of their surrogate code points. This function works faster with a frozen set and with a non-negative string length argument.
|
|
Returns the start of the trailing substring of the input string which consists only of characters and strings that are contained in this set (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), or only of characters and strings that are not contained in this set (USET_SPAN_NOT_CONTAINED). See USetSpanCondition for details. Malformed byte sequences are treated according to contains(0xfffd). This function works faster with a frozen set and with a non-negative string length argument.
|
|
Returns the length of the initial substring of the input string which consists only of characters and strings that are contained in this set (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), or only of characters and strings that are not contained in this set (USET_SPAN_NOT_CONTAINED). See USetSpanCondition for details. Similar to the strspn() C library function. Malformed byte sequences are treated according to contains(0xfffd). This function works faster with a frozen set and with a non-negative string length argument.
|
|
Returns a string representation of this set. If the result of calling this function is passed to uset_openPattern(), it will produce another set that is equal to this one.
|