22
33using BitIntegers: @define_integers
44
5- import Base: unsafe_getindex, == , show , promote_rule
5+ import Base: unsafe_getindex, == , cmp , promote_rule
66using Base: @_inline_meta , @propagate_inbounds , @_propagate_inbounds_meta
77import Base. GC: @preserve
88
"""
    ShortString{T}

String type for short strings with a fixed maximum size. Everything is held in
the single value `size_content` of type `T`.
"""
struct ShortString{T} <: AbstractString
    size_content::T
end
1215
13- """ Check if a string of size `sz` can be stored in ShortString{T}"""
14- function check_size (T, sz)
15- max_len = sizeof (T) - size_bytes (T) # the last few bytes are used to store the length
16- if sz > max_len
17- throw (ErrorException (" sizeof(::$T ) must be shorter than or equal to $(max_len) in length; you have supplied a string of size $sz " ))
18- end
19- end
"""Number of bytes in one chunk (a `UInt`) used when processing String values"""
const CHUNKSZ = sizeof(UInt)

"""Mask selecting the offset of a byte position within a chunk (used for alignment)"""
const CHUNKMSK = UInt(CHUNKSZ - 1)

"""Number of bits in one chunk used when processing String values"""
const CHUNKBITS = 8 * CHUNKSZ
2024
"""Return how many bytes of `T` are needed to store the size of the ShortString"""
function size_bytes(::Type{T}) where {T}
    # bits needed to represent any value in 0:(sizeof(T) - 1), rounded up to whole bytes
    nbits = count_ones(sizeof(T) - 1)
    return cld(nbits, 8)
end
2327
"""Return the largest number of content bytes a ShortString backed by `T` can hold"""
function max_len(T)
    total = sizeof(T)
    return total - size_bytes(T)
end
30+
"""
Verify that a string of `sz` bytes fits into a ShortString backed by `T`.
Throws an `ErrorException` when it does not.
"""
@inline function check_size(T, sz)
    maxlen = max_len(T)
    maxlen < sz &&
        throw(ErrorException(" sizeof(::$T ) must be shorter than or equal to $(maxlen) in length; you have supplied a string of size $sz "))
end
37+
"""Return a mask, of type `T`, that selects the bytes holding the stored size"""
function size_mask(T)
    nbits = 8 * size_bytes(T)
    return T((1 << nbits) - 1)
end

# Convenience method: derive the mask from the storage type of an instance
size_mask(::ShortString{T}) where {T} = size_mask(T)
2741
28- """ The size of the chunk used to process String values"""
29- const CHUNKSZ = sizeof (UInt)
30-
31- """ The number of bits in the chunk type used to process String values"""
32- const CHUNKBITS = sizeof (UInt) == 4 ? 32 : 64
"""Return the contents of the ShortString with the size bytes cleared, passed through `ntoh`"""
function _swapped_str(s::ShortString)
    content = s.size_content & ~size_mask(s)
    return ntoh(content)
end
3344
"""
Internal function to pick up the byte at index `i` in a ShortString.

The shift is computed from the width of the storage type `T`. Calling `sizeof(s)`
on the instance would be wrong here, because `Base.sizeof(::ShortString)` is
overloaded to return the string length, not the storage width.
No bounds checking is performed; callers must ensure `i` is a valid index.
"""
@inline _get_byte(s::ShortString{T}, i::Int) where {T} =
    (s.size_content >>> (8 * (sizeof(T) - i))) % UInt8
@@ -105,8 +116,7 @@ function ShortString{T}(s::ShortString{S}) where {T, S}
105116 # size_mask(S) will return a mask for getting the size for Shorting Strings in (content size)
106117 # format, so something like 00001111 in binary.
107118 # ~size_mask(S) will yield 11110000 which can be used as a mask to extract the content
108- content = ntoh (T (ntoh (s. size_content & ~ size_mask (S))))
109- ShortString {T} (content | T (sz))
119+ ShortString {T} (ntoh (T (_swapped_str (s))) | T (sz))
110120end
111121
112122""" Amount to shift ShortString value by for each UInt sized chunk"""
@@ -129,16 +139,16 @@ function String(s::ShortString{T}) where {T}
129139end
130140
# Code units of a ShortString are bytes
Base.codeunit(::ShortString) = UInt8
132- Base. codeunit (s:: ShortString , i) = codeunits (String (s), i)
133- Base. codeunit (s:: ShortString , i:: Integer ) = codeunit (String (s), i)
134- Base. codeunits (s:: ShortString ) = codeunits (String (s))
"""
Return the code unit (byte) at index `i` of the ShortString.
Throws a `BoundsError` for an invalid index unless bounds checking is elided.
"""
@inline function Base.codeunit(s::ShortString, i::Integer)
    @boundscheck checkbounds(s, i)
    # _get_byte only has a method for `Int` indices, so convert other Integer types
    return _get_byte(s, Int(i))
end
135146
# `convert` is called with the target *type* as its first argument, so the methods
# must dispatch on `::Type{...}`; dispatching on an instance is never reached.
Base.convert(::Type{ShortString{T}}, s::String) where {T} = ShortString{T}(s)
Base.convert(::Type{String}, ss::ShortString) = String(ss)
138149
# The size in bytes of the string is whatever is stored in the size bytes of `size_content`
function Base.sizeof(s::ShortString)
    stored = s.size_content & size_mask(s)
    return Int(stored)
end
140151
141- Base. firstindex (:: ShortString ) = 1
# Both the number of code units and the last index equal the stored size in bytes
Base.ncodeunits(s::ShortString) = sizeof(s)
Base.lastindex(s::ShortString) = sizeof(s)
144154
162172
163173@inline function Base. isascii (s:: ShortString{T} ) where {T}
164174 val = s. size_content >>> (8 * size_bytes (T))
165- for i in 1 : ( sizeof (T) - size_bytes (T) )
175+ for i in 1 : max_len (T )
166176 iszero (val & 0x80 ) || return false
167177 val >>>= 8
168178 end
@@ -197,29 +207,59 @@ end
197207 reinterpret (Char, _get_char (str, pos))
198208end
199209
200- function == (s:: ShortString{S} , b:: Union{String, SubString{String}} ) where {S}
201- ncodeunits (b) == ncodeunits (s) || return false
202- return s == ShortString {S} (b)
# Mask selecting the low `n & CHUNKMSK` bytes of a `UInt` chunk
@inline function _mask_bytes(n)
    nbits = (n & CHUNKMSK) << 3
    return (one(UInt) << nbits) - 0x1
end
211+
# Optimized version of checking for equality against a String.
# Compares the contents one `UInt`-sized chunk at a time, then the remaining bytes
# (fewer than a chunk) under a mask.
function ==(a::ShortString, b::String)
    sz = sizeof(a)
    sizeof(b) == sz || return false
    # Two empty strings are equal. For non-empty strings we must fall through to
    # the actual comparison below (returning `true` here would make every
    # same-length pair compare equal).
    sz == 0 && return true
    val = _swapped_str(a)
    # keep `b` rooted while we read through a raw pointer into its data
    @preserve b begin
        pnt = reinterpret(Ptr{UInt}, pointer(b))
        while sz >= sizeof(UInt)
            xor(val & typemax(UInt), unsafe_load(pnt)) == 0 || return false
            sz -= sizeof(UInt)
            val >>>= 8 * sizeof(UInt)
            pnt += CHUNKSZ
        end
        # NOTE(review): the tail load reads a whole chunk and masks off the bytes
        # past the end of the string; this assumes that load is safe for a String.
        return sz == 0 || val == (unsafe_load(pnt) & _mask_bytes(sz))
    end
end
204- function == (s:: ShortString , b:: AbstractString )
205- # Could be a string type that might not use UTF8 encoding and that we don't have a
206- # constructor for. Defer to equality that type probably has defined on `String`
207- return String (s) == b
229+
# Equality against a SubString{String}, compared one byte at a time.
# This can be optimized to be much faster, like the code in StrBase.jl, doing 4 or 8 byte
# chunks, as above, but it has to deal with alignment. Will add to a later PR
function ==(s::ShortString, b::SubString{String})
    sz = sizeof(s)
    sizeof(b) == sz || return false
    # Two empty strings are equal; non-empty ones must be compared byte by byte
    sz == 0 && return true
    val = _swapped_str(s)
    # The raw pointer is derived from `b`, so `b` is the object that must be preserved
    @preserve b begin
        pnt = pointer(b)
        while (sz -= 1) >= 0
            unsafe_load(pnt) == (val & 0xff) || return false
            pnt += 1
            val >>>= 8
        end
    end
    return true
end
209247
# Equality is symmetric: reuse the methods that take the ShortString first
function ==(a::AbstractString, b::ShortString)
    return b == a
end
211- function == (a:: ShortString{S} , b:: ShortString{S} ) where {S}
212- return a. size_content == b. size_content
213- end
214- function == (a:: ShortString{A} , b:: ShortString{B} ) where {A,B}
215- ncodeunits (a) == ncodeunits (b) || return false
216- # compare if equal after dropping size bits and
217- # flipping so that the empty bytes are at the start
218- ntoh (a. size_content & ~ size_mask (A)) == ntoh (b. size_content & ~ size_mask (B))
219- end
220249
221- function Base. cmp (a:: ShortString{S} , b:: ShortString{S} ) where {S}
222- return cmp (a. size_content, b. size_content)
# Same storage type: the raw values can be compared directly
function ==(a::ShortString{S}, b::ShortString{S}) where {S}
    return a.size_content == b.size_content
end

# Different storage types: the sizes must match, and so must the contents after dropping
# the size bits and flipping so that the empty bytes are at the start
function ==(a::ShortString, b::ShortString)
    sizeof(a) == sizeof(b) || return false
    return _swapped_str(a) == _swapped_str(b)
end
254+
# Same storage type: compare the raw values directly
cmp(a::ShortString{S}, b::ShortString{S}) where {S} = cmp(a.size_content, b.size_content)

# Different storage types: re-encode the narrower operand in the wider storage type
# (contents via `ntoh` of the widened `_swapped_str`, size or'ed back in), then
# compare the raw values.
function cmp(a::ShortString{S}, b::ShortString{T}) where {S,T}
    if sizeof(T) > sizeof(S)
        # `b` is wider: widen `a` to `T`
        return cmp(ntoh(T(_swapped_str(a))) | T(sizeof(a)), b.size_content)
    else
        # `a` is at least as wide: widen `b` to `S` (not `T`, which is b's own type)
        return cmp(a.size_content, ntoh(S(_swapped_str(b))) | S(sizeof(b)))
    end
end
224264
# Promoting a String together with any ShortString yields String
function promote_rule(::Type{String}, ::Type{ShortString{S}}) where {S}
    return String
end
@@ -240,9 +280,9 @@ size_content(s::ShortString) = s.size_content
# Default candidate storage types, listed in increasing size
const def_types = (
    UInt32, UInt64, UInt128, UInt256, UInt512, UInt1024, UInt2048,
)
241281
242282for T in def_types
243- max_len = sizeof (T) - size_bytes (T)
244- constructor_name = Symbol (:ShortString , max_len )
245- macro_name = Symbol (:ss , max_len , :_str )
283+ maxlen = max_len (T)
284+ constructor_name = Symbol (:ShortString , maxlen )
285+ macro_name = Symbol (:ss , maxlen , :_str )
246286
247287 @eval const $ constructor_name = ShortString{$ T}
248288 @eval macro $ (macro_name)(s)
@@ -257,17 +297,39 @@ which can be used to store the string
257297If no type is large enough, then an `ArgumentError` is thrown
258298"""
function get_type(maxlen; types=def_types)
    if maxlen < 1
        throw(ArgumentError(" $maxlen is <= 0"))
    end
    # `types` is scanned in the given order; the first type with enough room wins
    for T in types
        if maxlen <= max_len(T)
            return ShortString{T}
        end
    end
    throw(ArgumentError(" $maxlen is too large to fit into any of the provided types: $types "))
end
265306
"""
    ShortString(str, maxlen = sizeof(str); types = def_types)

Create a ShortString from `str`, using the first type in `types` that can hold
`maxlen` bytes. `maxlen` defaults to the size of `str`, and `types` is a list
(a tuple or Vector) of Unsigned types, in order of their size.
"""
function ShortString(str::Union{String,SubString{String}}, maxlen = sizeof(str); types=def_types)
    SS = get_type(maxlen, types=types)
    return SS(str)
end
268315
269- macro ss_str (str, max= " 0" )
270- :( ShortString ($ str, $ (parse (Int, max))) )
"""
    ss"..."

Create a ShortString, using the smallest ShortString that can fit the string,
unless the literal is followed by an optional suffix giving a maximum length
(a single ASCII character and then the length, or just the length).
`ss"foo"b255` indicates that a ShortString that can contain 255 bytes should be used.
"""
macro ss_str(str, max=nothing)
    maxlen = if max === nothing
        # no suffix: size the type to the literal itself
        sizeof(str)
    elseif max isa Integer
        max
    elseif max isa String
        # skip a leading non-digit marker character, if there is one
        digits = isdigit(max[1]) ? max : max[2:end]
        parsed = tryparse(Int, digits)
        parsed === nothing && throw(ArgumentError(" Optional length $max not a valid Integer"))
        parsed
    else
        throw(ArgumentError(" Unsupported type $(typeof(max)) for optional length $max "))
    end
    return :( ShortString($str, $maxlen) )
end
272334
273335fsort (v:: Vector{ShortString{T}} ; rev = false ) where {T} =
0 commit comments