# Source file: blockread.icn
#<p>
# Read in large ASCII files efficiently as large strings.
# Newlines are embedded in the strings, which always end
# at a new line.
#</p>
#<p>
# <b>Author:</b> Steve Wampler (<i>sbw@tapestry.tucson.az.us</i>)
#</p>
#<p>
# This file is in the <i>public domain</i>.
#</p>
import lang
package util
#<p>
# Provide an encapsulation class for reading in an ASCII text
# file in large chunks, but guarantee an integral number of
# lines in each chunk. (I.e. each chunk ends in a newline).
#</p>
#<p>
# This is a specialized class for efficiently reading in large
# text files. For example, comparing the performance of the
# following programs on the same hardware and input file:
#</p>
#<p>
#<pre>
# procedure main(args)
# i := 0
# every i +:= (!&input, 1)
# write(&errout, i)
# end
#</pre>
#and
#<pre>
# import util
#
# procedure main(args)
# bRead := BlockRead()
# i := 0
# while s := bRead.readBlock() do {
# every i +:= (upto('\n',s),1)
# }
# write(&errout, i)
# end
#</pre>
#</p>
#<p>
# The second program runs (on sufficiently large files) three times faster.
# <i>(Note that this example is contrived, as using the function reads()
# would work just as well and would be slightly faster...)</i>
#</p>
#<p>
# The following program cannot easily be replaced by one using reads():
#<pre>
# import util
#
# procedure main(args)
# wFreq := table(0)
# every wFreq[genWords(BlockRead().genBlocks())] +:= 1
# every wPair := !reverse(sort(wFreq,2)) \ 20 do {
# write(right(wPair[2],10),": ",wPair[1])
# }
# end
#</pre>
#and can be almost twice as fast (<i>see caveat one</i>, below) as:
#<pre>
# import util
#
# procedure main(args)
# wFreq := table(0)
# every wFreq[genWords(!&input)] +:= 1
# every wPair := !reverse(sort(wFreq,2)) \ 20 do {
# write(right(wPair[2],10),": ",wPair[1])
# }
# end
#</pre>
#<p>
# <b>Caveat one:</b> It is not always better to read in large chunks of lines -
# the actions you perform on those chunks have a large influence on
# overall program efficiency and using large chunks may, in some cases,
# slow your program down! Choosing a good block size for use in your
# application is an art.
#</p>
#<p>
# <b>Caveat two:</b> You cannot easily mix reads using this class with
# reads of the same file using other functions.
#</p>
class BlockRead : Object (f, bSize, buffer)

   #<p>
   # Read roughly <b>n</b> characters from the file, always ending the
   # returned block at a newline.  Characters read past the last newline
   # are held in the <i>read-ahead</i> buffer and are prepended to the
   # next block, so a returned block may be longer than <b>n</b>.
   #</p>
   #<p>
   # If a chunk of <b>n</b> characters contains no newline at all (a line
   # longer than <b>n</b>), reading continues until a newline or the end
   # of the input is seen, so a line is never split across blocks.
   # Only the final block of a file whose last line lacks a newline
   # is returned without a terminating newline.
   #</p>
   #<p>
   # <[param n number of characters requested per read.
   #     Defaults to <tt>blockSize</tt>]>
   # <[returns string of whole lines from the file,
   #     terminating with a newline (except possibly at end of input)]>
   # <[fails if unable to read any characters]>
   #</p>
   method readBlock(n)
      local s, i, block

      /n := bSize
      while s := ::reads(f, n) do {
         # Scan backwards for the last newline in this chunk.
         i := *s
         while (i > 0) & (s[i] ~== "\n") do {
            i -:= 1
         }
         if i > 0 then {
            # Carry-over from the previous read comes first, then every
            # complete line of this chunk; the tail becomes the new
            # read-ahead.
            block := buffer || s[1+:i]
            buffer := s[i+1:0]
            return block
         }
         # No newline in this chunk: it is part of one long line, so
         # keep accumulating rather than returning a partial line.
         buffer ||:= s
      }

      # End of input: flush whatever is left in the read-ahead buffer.
      if *buffer > 0 then {
         block := buffer
         buffer := ""
         return block
      }
      fail
   end

   #<p>
   # Generate successive blocks from the file, using the same criteria
   # for defining a block as in <b>readBlock</b>.  Generation stops
   # when <b>readBlock</b> fails.
   #</p>
   #<p>
   # <[param n number of characters requested per read.
   #     Defaults to <tt>blockSize</tt>]>
   # <[generates blocks from the file as strings]>
   #</p>
   method genBlocks(n)
      suspend |readBlock(n)
   end

   #<p>
   # Produce the current <i>read-ahead</i> buffer.  This buffer contains
   # input characters that were read on the previous block read, but
   # followed the last newline in the block.
   # <[returns read-ahead buffer]>
   #</p>
   #<p>
   # This is a convenience method to help when input needs to be
   # mixed with non-block reads.
   #</p>
   method getReadahead()
      return buffer
   end

   #<p>
   # Set the current <i>read-ahead</i> buffer.  The previous value is
   # lost.  It is difficult to imagine a use for this method except
   # to empty the preread when mixing block reads with normal input
   # operations.
   # <[param nBuf new contents for the read-ahead buffer.  A null
   #     value empties the buffer.]>
   #</p>
   method setReadahead(nBuf)
      # A null buffer would make the next readBlock() fail with a
      # run-time error on *buffer, so treat null as "empty".
      /nBuf := ""
      buffer := nBuf
   end

   #<p>
   # Provide an instance of <b>BlockRead</b> for reading in
   # <b>blockSize</b> chunks at a time from <b>fileOrName</b>.
   # <[param fileOrName file or filename to read from.  Defaults to
   #     <tt>&input</tt>.]>
   # <[param blockSize default blocksize for this instance.
   #     Defaults to <tt>409600</tt>.]>
   #</p>
   #<p>
   # The first argument may be an already opened file.  If it is a
   # string, the named file is opened for reading and the program is
   # stopped with an error message if the open fails.
   #</p>
   initially (fileOrName, blockSize)
      if /fileOrName then {
         f := &input
      }
      else {
         if ::type(fileOrName) == "string" then {
            f := ::open(fileOrName) |
                 ::stop("BlockRead: Cannot open '",fileOrName,"'!")
         }
         else {
            f := fileOrName     # Assume it's a file
         }
      }
      /blockSize := 409600      # default to 400 KB (409600-byte) reads
      bSize := blockSize
      buffer := ""              # nothing has been read ahead yet
end
# This page produced by UniDoc on 2021/04/15 @ 23:59:44.