############################################################################
#
# File: ngrams.icn
#
# Subject: Procedures to produce n-grams
#
# Author: Ralph E. Griswold
#
# Date: March 20, 1998
#
############################################################################
#
# This file is in the public domain.
#
############################################################################
#
# The procedure ngrams(s, n, c, t) generates a tabulation of the n-grams
# in the specified string. If c is non-null, it is used as the set of
# characters from which n-grams are taken (other characters break n-grams).
# The default for c is the upper- and lowercase letters. If t is null,
# the tabulation is given in order of decreasing frequency; otherwise it
# is given in alphabetical order of n-grams.
#
# For backward compatibility, the first argument may be a file, in
# which case, it is read to provide the string.
#
############################################################################
#
#  ngrams(s, i, c, t) -- generate a tabulation of the i-grams found in s.
#
#     s   the string to analyze, or a file; a file is read in its
#         entirety to provide the string
#     i   n-gram length; must convert to a positive integer
#     c   characters from which n-grams are taken (any other character
#         ends the current run); defaults to &lcase || &ucase
#     t   ordering selector: if null, results are produced in order of
#         decreasing count; if non-null, in sorted order of the n-grams
#
#  Each result is an n-gram followed by its count, right-justified in
#  8 columns.  The program is stopped with a message if i or c is invalid.
#
procedure ngrams(s, i, c, t)		#: n-grams with count
   local line, grams, a, count

   if not (integer(i) > 0) then stop("*** invalid ngrams specification")
   /c := &lcase || &ucase
   if not (c := cset(c)) then stop("*** invalid cset specification")

   grams := table(0)

   if type(s) == "file" then {
      line := ""
      # Read from the file that was passed in.  A null first argument
      # would make reads() fall back to &input.
      while line ||:= reads(s, 1000)
      }
   else line := s

   # Outer scan: locate each maximal run of characters in c.
   # Inner scan: slide an i-character window over that run.
   # The "\ 1" limits tab() to one result so that it is not resumed
   # (which would restore the outer position) when the inner scan ends.
   line ? while tab(upto(c)) do
      (tab(many(c)) \ 1) ? while grams[move(i)] +:= 1 do
         move(-i + 1)		# back up so successive n-grams overlap

   if /t then {
      # sort(grams, 4) orders by count, lowest first, as a flat list of
      # key, value, key, value ...; pulling from the end therefore
      # yields the highest count first, value before key.
      a := sort(grams, 4)
      while count := pull(a) do
         suspend pull(a) || right(count, 8)
      }
   else {
      # sort(grams, 3) orders by n-gram; get() takes key, then value.
      a := sort(grams, 3)
      suspend |(get(a) || right(get(a), 8))
      }

end
#
#  ngramset(s, i, c) -- return the set of distinct i-grams found in s.
#
#     s   the string to analyze, or a file; a file is read in its
#         entirety to provide the string
#     i   n-gram length; must convert to a positive integer
#     c   characters from which n-grams are taken (any other character
#         ends the current run); defaults to &lcase || &ucase
#
#  The program is stopped with a message if i or c is invalid.
#
procedure ngramset(s, i, c)		#: n-grams set
   local line, grams

   if not (integer(i) > 0) then stop("*** invalid ngrams specification")
   /c := &lcase || &ucase
   if not (c := cset(c)) then stop("*** invalid cset specification")

   grams := set()

   if type(s) == "file" then {
      line := ""
      # Read from the file that was passed in.  A null first argument
      # would make reads() fall back to &input.
      while line ||:= reads(s, 1000)
      }
   else line := s

   # Outer scan: locate each maximal run of characters in c.
   # Inner scan: slide an i-character window over that run.
   # The "\ 1" limits tab() to one result so that it is not resumed
   # (which would restore the outer position) when the inner scan ends.
   line ? while tab(upto(c)) do
      (tab(many(c)) \ 1) ? while insert(grams, move(i)) do
         move(-i + 1)		# back up so successive n-grams overlap

   return grams

end
# This page produced by UniDoc on 2021/04/15 @ 23:59:44.