############################################################################
#
# File: scan.icn
#
# Subject: Procedures related to scanning
#
# Author: Richard L. Goerwitz, David A. Gamey, and Ralph E. Griswold
#
# Date: May 2, 2001
#
############################################################################
#
# This file is in the public domain.
#
############################################################################
#
# Contributors: Randal L. Schwartz and Cheyenne Wills
#
############################################################################
#
# This module contains procedures related to string scanning:
#
# balq(c1, c2, c3, c4, c5, s, i1, i2)
# like bal() with quoting from characters in c5.
#
# balqc(c1, c2, c3, c4, c5, s1, s2, s3, i1, i2)
# like balq() with the addition that balanced characters within
# "comments", as delimited by the strings s1 and s2, are also
# excluded from balancing. In addition, if s1 is given and s2
#
# limatch(L, c)
# matches items in list L delimited by characters in c
#
# slashbal(c1, c2, c3, s, i, j)
# like bal() with escape processing
#
# slashupto(c, s, i, j)
# like upto() with escape processing
#
# slshupto()
# synonym for slashupto()
#
# snapshot(title, len)
# snapshot of string scanning with optional title and
# maximum length.
#
# More extensive documentation proceeds each procedure.
#
############################################################################
#
# Richard L. Goerwitz:
#
# I am often frustrated at bal()'s inability to deal elegantly with
# the common \backslash escaping convention (a way of telling Unix
# Bourne and C shells, for instance, not to interpret a given
# character as a "metacharacter"). I recognize that bal()'s generic
# behavior is a must, and so I wrote slashbal() to fill the gap.
#
# Slashbal behaves like bal, except that it ignores, for purposes of
# balancing, any c2/c3 char which is preceded by a backslash. Note
# that we are talking about internally represented backslashes, and
# not necessarily the backslashes used in Icon string literals. If
# you have "\(" in your source code, the string produced will have no
# backslash. To get this effect, you would need to write "\\(."
#
# BUGS: Note that, like bal() (v8), slashbal() cannot correctly
# handle cases where c2 and c3 intersect. Note also that older ver-
# sions of this routine counted from the beginning of the string,
# instead of from i. This feature came to be regarded as a bug when
# put into actual use (especially when I realized that bal() doesn't
# work this way).
#
############################################################################
procedure slashbal(c1, c2, c3, s, i, j) #: bal() with escapes
local twocs, allcs, default_val, POS, chr, chr2, count
/c1 := &cset
/c2 := '('
/c3 := ')'
twocs := c2 ++ c3
allcs := c1 ++ c2 ++ c3 ++ '\\'
if /s := &subject
then default_val := &pos
else default_val := 1
if \i then {
if i < 1 then
i := *s + (i+1)
}
else i := default_val
if \j then {
if j < 1 then
j := *s + (j+1)
}
else j := *s + 1
count := 0; POS := i - 1
s[i:j] ? {
while tab(upto(allcs)) do {
chr := move(1)
if chr == "\\" & any(twocs) then {
chr2 := move(1)
if any(c1, chr) & count = 0 then
suspend POS + .&pos - 2
if any(c1, chr2) & count = 0 then
suspend POS + .&pos - 1
}
else {
if any(c1, chr) & count = 0 then
suspend POS + .&pos - 1
if any(c2, chr) then
count +:= 1
else if any(c3, chr) & count > 0 then
count -:= 1
}
}
}
end
############################################################################
#
# Richard L. Goerwitz:
#
# Slshupto works just like upto, except that it ignores backslash
# escaped characters. I can't even begin to express how often I've
# run into problems applying Icon's string scanning facilities to
# input that uses backslash escaping. Normally, I tokenize first
# and then work with lists. With slshupto(), I can now postpone or
# even eliminate the traditional tokenizing step and let Icon's
# string scanning facilities to do more of the work.
#
# If you're confused:
#
# Typically UNIX utilities (and probably others) use backslashes to
# "escape" (i.e. remove the special meaning of) metacharacters. For
# instance, UNIX shells normally accept "*" as a shorthand for "any
# series of zero or more characters. You can make the "*" a literal
# "*" with no special meaning by prepending a backslash. The routine
# slshupto() understands these backslashing conventions. You
# can use it to find the "*" and other special characters because it
# will ignore "escaped" characters.
#
############################################################################
# for compatibility with the original name
#
procedure slashupto(c, s, i, j) #: upto() with escapes
suspend slshupto(c, s, i, j)
end
#
# slshupto: cset x string x integer x integer -> integers
# (c, s, i, j) -> I's (a generator)
# where I's are the integer positions in s[i:j] before characters
# in c that is not preceded by a backslash escape
#
procedure slshupto(c, s, i, j) #: upto() with escapes
local c2
if /s := &subject
then /i := &pos
else /i := 1
/j := *s + 1
/c := &cset
c2 := '\\' ++ c
s[1:j] ? {
tab(i)
while tab(upto(c2)) do {
if ="\\" then {
move(1) | {
if find("\\", c)
then return &pos - 1
}
next
}
suspend .&pos
move(1)
}
}
end
############################################################################
#
# The procedure snapshot(title,len) writes a snapshot of the state
# of string scanning, showing the value of &subject and &pos, an
# optional title, and (again optionally) wrapping the display
# for len width.
#
# For example,
#
# "((a+b)-delta)/(c*d))" ? {
# tab(bal('+-/*'))
# snapshot("example")
# }
#
# produces
#
# ---example---------------------------
# | |
# | |
# | &subject = "((a+b)-delta)/(c*d))" |
# | | |
# | |
# -------------------------------------
#
# Note that the bar showing the &pos is positioned under the &posth
# character (actual positions are between characters). If &pos is
# at the end of &subject, the bar is positioned under the quotation
# mark delimiting the subject. For example,
#
# "abcdefgh" ? (tab(0) & snapshot())
#
# produces
#
# -------------------------
# | |
# | |
# | &subject = "abcdefgh" |
# | | |
# | |
# -------------------------
#
# Escape sequences are handled properly. For example,
#
# "abc\tdef\nghi" ? (tab(upto('\n')) & snapshot())
#
# produces
#
# ------------------------------
# | |
# | |
# | &subject = "abc\tdef\nghi" |
# | | |
# | |
# ------------------------------
#
# The title argument places a title into the top bar, as in
#
# "abc\tdef\nghi" ? (tab(upto('\n')) & snapshot("upto('\n')")
#
# which produces
#
# --upto('\n')-------------------
# | |
# | |
# | &subject = "abc\tdef\nghi" |
# | | |
# | |
# -------------------------------
#
# The len argument rewraps the display for a screen of len width.
#
############################################################################
procedure snapshot(title,len) #: snapshot of string scanning
local bar1, bar2, bar3, is, is0, prefix, titlel, placement, POS
/title := "" # no meaningful default
\len <:= 20 # any less is really not useful
prefix := "&subject = "
is := image(&subject)
is0 := *image(&subject[1:&pos]) | fail
#
# Set up top and bottom bars (not exceeding len width, if
# len is nonnull). Fit title into top bar (bar1).
#
bar1 := bar3 := repl("-", *is + *prefix + 4)[1:\len-4|0]
# in *is + *prefix + 4, the 4 is for two vbars/two spaces
titlel := (*title > *bar3-4) | *title[1:\len-4|0]
bar1 ?:= move(3) || (tab(4+titlel), title) || tab(0)
#
# Write bar1, then spacers (bar2). Then write out len-size chunks
# of &subject, with the | pointer-line, where appropriate. Finally,
# write out bar3 (like bar1, but with no title).
#
write(bar1)
bar2 := "|" || repl(" ", *bar3 - 2) || "|"
write(bar2, "\n", bar2)
placement := *prefix + is0
(prefix || is) ? {
until pos(0) do {
POS := &pos - 1
write("| ", move(*bar3-4) | left(tab(0), *bar3-4), " |")
if POS < placement < &pos then {
writes("| ")
writes(left(repl(" ", placement - POS - 1) || "|", *bar3-4))
write(" |\n", bar2)
}
else write(bar2, "\n", bar2)
}
}
write(bar3)
return # nothing useful to return
end
############################################################################
#
# David A. Gamey:
#
# balq( c1, c2, c3, c4, c5, s, i1, i2 ) : i3
#
# generates the sequence of integer positions in s preceding a
# character of c1 in s[i1:i2] that is (a) balanced with respect to
# characters in c2 and c3 and (b) not "quoted" by characters in c4
# with "escape" sequences as defined in c5, but
# fails if there is no such position.
#
# defaults: same as for bal,
# c4 the single and double quote characters ' and "
# c5 the backwards slash \
# errors: same as for bal,
# c4 & c5 not csets
#
# balqc( c1, c2, c3, c4, c5, s1, s2, s3, i1, i2 ) : i3
#
# like balq with the addition that balanced characters within
# "comments", as delimited by the strings s1 and s2, are also
# excluded from balancing. In addition, if s1 is given and s2
# is null then the comment terminates at the end of string.
#
# defaults: same as for balq,
# s3 is the subject string
# s1 "/*"
# s2 "*/" if s1 defaults, null otherwise
# errors: same as for balq,
# s1 is not a string
# s2 is not a string (if s1 defaults or is specified)
#
#############################################################################
procedure balq( #: bal() with quote escaping.
cstop, copen, cclose, cquote, cescape, s, i1, i2)
local quote, pcount, spos
local ca, c, sp
if /s := &subject then /i1 := &pos
/i1 := 1
/i2 := 0
/cstop := &cset # stopping characters
/copen := '(' # open characters
/cclose := ')' # close characters
/cquote := '\'\"' # quote characters
/cescape := '\\' # escape characters
pcount := 0 # "parenthesis" counter
spos := i1 # scanning position
ca := cstop ++ copen ++ cclose ++ cquote ++ cescape # characters to check
while sp := s[ spos := upto( ca, s, spos, i2 ) ] do {
if /quote & ( pcount = 0 ) & any( cstop, sp) then suspend spos
if any( c := ( copen | cclose | cquote | cescape ), sp ) then
case c of {
copen : if /quote then
pcount +:= 1
cclose : if /quote then
if ( pcount -:= 1 ) < 0 then
fail
cquote : if /quote then
quote := sp
else
if quote == sp then quote := &null
cescape: if \quote then
spos +:= 1
}
spos +:= 1
}
end
procedure balqc( #: balq() with comment escaping
cstop, copen, cclose, cquote, cescape, scm, ecm, s, i1, i2)
local quote, pcount, spos
local ca, c, sp
local ccom, comnt
if /s := &subject then /i1 := &pos
/i1 := 1
/i2 := 0
/cstop := &cset # stopping characters
/copen := '(' # open characters
/cclose := ')' # close characters
/cquote := '\'\"' # quote characters
/cescape := '\\' # escape characters
if /scm & /ecm then {
scm := "/*" # start of comment
ecm := "*/" # end of comment
}
else
if \scm & /ecm then
ecm := &null # icon style comment
ccom := ''
ccom ++:= cset(\scm[1])
ccom ++:= cset(\ecm[1])
pcount := 0 # "parenthesis" counter
spos := i1 # scanning position
ca := cstop ++ copen ++ cclose ++ cquote ++ cescape ++ ccom # chars to check
while sp := s[ spos := upto( ca, s, spos, i2 ) ] do {
if /quote & ( pcount = 0 ) & /comnt & any( cstop, sp) then
suspend spos
if any( c := ( copen | cclose | cquote | cescape | ccom ), sp ) then
case c of {
copen : if /quote & /comnt then
pcount +:= 1
cclose : if /quote & /comnt then
if ( pcount -:= 1 ) < 0 then
fail
cquote : if /comnt then
if /quote then
quote := sp
else
if quote == sp then quote := &null
cescape: if \quote then
spos +:= 1
ccom : if /quote then
if /comnt then {
if comnt := ( s[ spos +: *scm ] == scm ) then
spos +:= *scm - 1
}
else
if \ecm == s[ spos +: *ecm ] then {
spos +:= *ecm - 1
comnt := &null
}
}
spos +:= 1
}
end
#############################################################################
#
# This matching function illustrates how every can be
# used in string scanning.
#
# 1. Each element of the list argument is matched in
# succession.
# 2. Leading characters in the subject are skipped over
# to match the first element.
# 3. The strings listed may be seperated by other characters
# provided they are specified in a cset of characters to
# be ignored.
#
# It could be used to find things in text that have varying
# representations, for example: "i.e.", "e.g.", "P.O.", etc.
#
# limatch(l,i)
#
# l list of strings to be found
# i cset containing characters to be ignored between each string
#
# returns the last cursor position scanned to, or fails
#
#############################################################################
procedure limatch(l,i) #: matching items in list
local s, f, p
p := &pos
every ( s := !l ) | ( return p ) do
{
if /f := 1 then tab(find(s)) # startup - position at first string
tab(match(s)) | fail # fail if not matched
tab(many(i) | &pos) # skip ignore chars. if any
p := &pos # remember last position
}
end
This page produced by UniDoc on 2021/04/15 @ 23:59:43.