str_util.icn - source code

Source file str_util.icn
#<p>
#<b>General purpose string routines contributed by various people.</b>
#
#   This is one of several files contributing to the util package.
#</p>
#<p>
# <b>Authors:</b> Steve Wampler (<i>sbw@tapestry.tucson.az.us</i>)
#                 Robert Parlett (parlett@dial.pipex.com)
#    and others.
#</p>
#<p>
#  This file is in the <i>public domain</i>.
#</p>

package util

import lang

#
# Utility to get next integer
#
procedure get_int()
   suspend ::tab(::upto(&digits)) & ::integer(::tab(::many(&digits)))
end

#<p>
# Does string s end in suffix?
#   <[returns the prefix on a successful match]>
#   <[fails if suffix does not match]>
#</p>
procedure zapSuffix(s,      # String to examine
                    suffix  # Suffix to find and remove
                   )
    return (::string(s), ::string(suffix), s[-*suffix:0] == suffix, s[1:-*suffix])
end

#<p>
# Does string <b>s</b> start with <b>prefix</b>?
#   <[returns the suffix on a successful match]>
#   <[fails if prefix does not match]>
#</p>
procedure zapPrefix(s,      # String to examine
                    prefix  # Prefix to find and remove
                   )
    return (::string(s), ::string(prefix), s[1+:*prefix] == prefix, s[*prefix+1:0])
end

#<p>
# Strip suffix from string s.
#   <[returns <tt>s</tt> with <tt>suffix</tt> removed, or <tt>s</tt>
#      if <tt>suffix</tt> not found]>
#</p>
procedure delSuffix(s,      # String to examine
                    suffix  # Suffix to remove if present
                   )
    return zapSuffix(s, suffix) | ::string(s)
end

#<p>
# Strip prefix from string s.
#   <[returns <tt>s</tt> with <tt>prefix</tt> removed, or <tt>s</tt>
#      if <tt>prefix</tt> not found]>
#</p>
procedure delPrefix(s,      # String to examine
                    prefix  # Suffix to remove if present
                   )
    return zapPrefix(s, prefix) | ::string(s)
end

#<p>
# Trim all characters in cs from the left edge of s.
#    Same parameters and defaults as trim(s,cs).
#  <[returns <tt>s</tt> with prefix of characters in <tt>cs</tt> stripped]>
#</p>
procedure lTrim(s,      # String to examine
                cs      # Characters to strip from beginning of <tt>s</tt>.
               )
    return ::reverse(::trim(::reverse(s),cs))
end

#<p>
# <[generate the "words" in s.  By default, words are defined
# as sequences of 1 or more letters]>
#</p>
procedure genWords(s,   # String to examine
                   cs   # Characters comprising words (default: &letters)
                  )
    /cs := &letters
    s ? while &pos := ::upto(cs) do {
        suspend ::tab(::many(cs))\1
        }
end


#<p>
# <[generate the fields in s that are _separated_ by sequences of
#   <i>one or more</i> characters in cs.]>
#</p>
procedure genFields(s,  # String to examine
                    cs:' \t\n'
                   )
    local s1

    lTrim(s, cs) ? {
        while s1 := ::tab(::upto(cs)|0) do {
            suspend s1
            ::tab(::many(cs)) | fail
            }
        }
end

#<p>
# <[generate the fields in s that are _separated_ by exactly one
#   of the characters in cs (i.e. can have empty fields).
#   Leading characters in cs are treated as separating empty fields.]>
#</p>
procedure genFieldsOne(s,   # String to examine
                       cs:' \t\n'
                      )
    s ? {
        while s1 := ::tab(::upto(cs)|0) do {
            suspend s1
            ::move(1) | fail
            }
        }
end

#<p>
# Hide escaped instances of characters.  Often, string scanning
# can be simplified if some characters are 'hidden' whenever they
# are escaped. (For example, consider running <b>bal()</b> on <b>"(\))"</b>.)
# This procedure can be used to hide/unhide such escaped characters
# by converting them to other (typically in <b>&cset--&ascii</b>).
# <i>Unhiding is accomplished by switching the second and third arguments.</i>
#   <[returns <tt>s</tt> with characters in <tt>s1</tt> hidden by
#        replacing them with corresponding characters in <tt>s2</tt>]>
# </p><p>
# The second and third parameters behave as in <b>map()</b>, except
# that s1 defaults to <b>"("</b> and s2 defaults to a character string
# selected from <b>&cset--&ascii</b>.  <i>Note that accepting the
# defaults makes it more challenging to <b>unhide</b> the characters
# later!</i>
# </p><p>
# The fourth parameter defaults to a backslash.
# </p><p>
# Probably doesn't work right when trying to hide escapes...
# </p>
procedure hideEscapedChars(s,       # String to examine
                           s1:"(",  # Look for these characters
                           s2,      # ... and replace with these
                           esc:'\\' # Escape character
                          )
    local c1 := ::cset(s1), ns := ""

    /s2 := hideAllChars(s1)

    s ? {
        while ns ||:= ::tab(::upto(c1)) do {
            if isEscapeSeq(ns, esc) then {
                ns ||:= ::map(::move(1), s1, s2)
                }
            else {
                ns ||:= ::move(1)
                }
            }
        ns ||:= ::tab(0)
        }

    return ns
end

#<p>
# Shift all the characters in a string into characters outside of <b>&ascii</b>.
# This procedure is intended as aid when using <b>hideEscapedChars()</b> by
# making it easier to construct the third argument, as in:
# <pre>
#    s := hideEscapedChars(s, s1, hideAllChars(s1))
# </pre>
#  <[returns string with all characters mapped out of <b>&ascii</b>]>
#</p>
procedure hideAllChars(s    # String of characters to hide.
                      )
    return ::map(s, &ascii, &cset--&ascii)
end

#<p>
# Given a string and a table of strings (keys and entries), replaces
#   occurrences of the table key values with the corresponding entry values.
#   <i>Finds longest sequence matching a key value.</i>
#  <[returns copy of <tt>s</tt> with replaced substrings]>
#</p>
#<p>
# <tt>replaceStrs</tt> is most useful in simple cases where you have only
# a few replacements to make on comparatively short text.  For large numbers
# of replacements across a lot of text, the <tt>StringReplacer</tt> class
# is likely to faster.
#</p>
procedure replaceStrs(s,    # String to examine
                      tbl   # Table of replacement strings.
                            #   Keys are substrings to locate,
                            #   entries are corresponding replacements
                     )
    local a, ns := ""
    every ::put(a := [], ::key(tbl))
    a := ::reverse(::sort(a))
    s ? {
        while ns ||:= ::tab(findFirst(a)) do {
            ns ||:= tbl[=!a]
            }
        ns ||:= ::tab(0)
        }
    return ns
end

#<p>
# Given two strings, produces the longest prefix they share in common.
#   <[returns longest common prefix of <tt>s1</tt> and <tt>s2</tt>]>
#</p>
procedure lcp(s1,s2)
   return s2 ? =s1[1+:*(if *s1 > *s2 then s2 else s1) to 0 by -1]
end

#<p>
# Given two strings, produces the longest suffix they share in common.
#   <[returns longest common suffix of <tt>s1</tt> and <tt>s2</tt>]>
#</p>
procedure lcs(s1,s2)
   return ::reverse(lcp(::reverse(s1),::reverse(s2)))
end

#<p>
# <b><i>Deprecated in favor of the MapBytes class</i></b>
#</p>
#<p>
# Do byte reorderings efficiently.
#   Assumes that the length of string <b>s</b> is a multiple of
#   the length of <b>in</b>.  If this isn't the case, the remaining
#   characters in s are appended unmapped.
#<p>
# <i>The default action is the identity mapping</i>
#</p>
#<p>
# To work, <b>in</b> and <b>out</b> must be the same length.
#</p>
procedure mapBytes(s,   # String to re-arrange
                   in,  # Input byte order,  e.g. "1234"
                   out  # Output byte order, e.g.: "4321"
                  )
    local  ns := ""
    static inMap, outMap, saveIn, saveOut

    if /in then {       # default to identity mapping
        in  := "1"
        out := "1"
        }

    # bootstrap...
    if (/saveIn  | (saveIn  ~==  in)) |
       (/saveOut | (saveOut ~== out)) then {
        saveIn  := in
        saveOut := out
        inMap   := ::string(&cset)
        inMap   := inMap[1+:((*inMap/*in)*(*in))]
        outMap  := ""
        inMap ? while(outMap ||:= ::map(in,out,::move(*in)))
        }

    # now do the work
    s ? {
        while ns ||:= ::map(inMap,outMap,::move(*inMap))
        while ns ||:= ::map(in,out,::move(*in))  # catch short strings
        ns ||:= ::tab(0)                       # just append last tidbit
        }
    return ns
end

#<p>
#   Convert of string with (possibly) embedded separators into a list.
#  <[param s string to convert into list of strings]>
#  <[param sep character to break string on (defaults to newline)]>
#  <[returns list of strings from <tt>s</tt>]>
#</p>
procedure stringToList(s, sep:'\n')
    local a := []

    if s[0] == sep then s := s[1:-1]       # strip only last sep

    s ? {
        while ::put(a, ::tab(::upto(sep)|0)) do ::move(1) | break
        }

    return a
end

#<p>
#   Convert a list of strings into a single string.
#  <[param a list to join elements from into a string]>
#  <[param sep character to join elements on (defaults to newline)]>
#  <[returns string formed by joining elements in <tt>a</tt>.
#</p>
procedure listToString(a, sep:"\n")
   local s := ""

   every s ||:= !a || sep
   return s
end

#<p>
#  Pad a string on the left if it is too short.
#  This method is similar to the function <tt>left</tt> except that
#  <tt>lPad</tt> returns the original string if it is longer than
#  the specified field, while <tt>left</tt> truncates the string to
#  fit in the field.
#  <[param s string to pad]>
#  <[param n desired field width.  The actual width will be max(*s, n)]>
#  <[param p string to use as padding]>
#</p>
procedure lPad(s,n,p)
   return if *s < n then ::left(s,n,p) else s
end

#<p>
#  Pad a string on the right if it is too short.
#  This method is similar to the function <tt>right</tt> except that
#  <tt>rPad</tt> returns the original string if it is longer than
#  the specified field, while <tt>right</tt> truncates the string to
#  fit in the field.
#  <[param s string to pad]>
#  <[param n desired field width.  The actual width will be max(*s, n)]>
#  <[param p string to use as padding]>
#</p>
procedure rPad(s,n,p)
   return if *s < n then ::right(s,n,p) else s
end

#<p>
# Generate the fields from a CSV-encoded string.  Fails on any field that
#  isn't well-formed:<br><br>
#  - fields with doubles quotes or embedded separators must be enclosed
#      in double quotes<br>
#  - double quotes in fields must be doubled: e.g. to embed "five", use
#      ""five""<br>
#  - leading and trailing whitespace (blanks and tabs) is stripped unless
#    embedded inside a double-quoted field<br>
#  - if blank and/or tab are used as separators, they are not considered
#    whitespace<br>
#
# Also, any characters except double quotes and newlines may be used
#  as separators.
# <[generate the fields in CSV-formatted string <tt>s</tt>]>
# <[param s CSV string to parse]>
# <[param sep set of characters (default <tt>','</tt>) to separate fields]>
# <[param spanFlag if non-null, indicates that fields are separated by one or
#                           more separators instead of just one]>
#</p>
procedure genCSV(s, sep, spanFlag)
    local WS, r, spanner
    static span, nospan
    initial {
        # Default to not treating spans of separators as single separators
        nospan := makeProc { repeat { @&source; ::move(1) } }
        span   := makeProc { repeat { sep := @&source; ::tab(::many(\sep[1])) } }
        }

    spanner := if /spanFlag then nospan else span

    /sep := ","
    WS := ' \t\n' -- sep  # Can't include separators in whitespace
                          #  (The '\n' is a trick so there's always
                          #  something at the end of the last field.)

    (s||WS) ? {
        while not ::pos(0) do {
            ::tab(::many(WS))
            if ="\"" then {    # Quoted string
                r := ""
                while r ||:= tabSkip("\"") do {
                    if not ="\"" then break     # End of quoted string
                    r ||:= "\""
                    }
                suspend (::tab((::upto(sep))|0)\1, r) do spanner(sep)
                }
            else suspend ::trim(::tab((::upto(sep))|0)\1,WS) do spanner(sep)
            }
        }
end

#<p>
#  Produce a list of the fields from a CSV-formatted string.
# <[returns a list of the fields in the string <tt>s</tt>]>
# <[param s CSV string to parse]>
# <[param sep set of characters (default <tt>','</tt>) to separate fields]>
# <[param spanFlag if non-null, indicates that fields are separated by one or
#                           more separators instead of just one]>
#</p>
procedure parseCSV(s, sep, spanFlag)
    local A
    every ::put(A := [], genCSV(s, sep, spanFlag))
    return A
end

#<p>
# Produce a CVS-formatted string from a list of field values
# <[returns a CSV-formatted string for the list of fields]>
# <[param A list to put into CSV-formatted string]>
# <[param sep character used to separate fields (default <tt>','</tt>)]>
#</p>
procedure encodeCSV(A, sep)
    local r
    /sep := ","
    every (r := "") ||:= encodeCsvField(!A) || sep
    r := r[1:-1]
    return r
end

#<p>
# Put a single field into legal CSV form.
# <[returns the field in legal CSV format]>
# <[param s field to format]>
#</p>
procedure encodeCsvField(field)
    local r
    r := ""
    field ? {
        while r ||:= ::tab(::upto('"')+1) do r ||:= "\""
        r := "\"" || r || ::tab(0) || "\""
        }
    return r
end
This page produced by UniDoc on 2021/04/15 @ 23:59:44.