Skip to content

Commit a6d20c0

Browse files
committed
Follow up after 4d68817
1 parent d5d942b commit a6d20c0

File tree

9 files changed

+102
-122
lines changed

9 files changed

+102
-122
lines changed

src/ifcopenshell-python/ifcopenshell/__init__.py

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -299,20 +299,44 @@ def guess_format(path: Path) -> Literal[".ifc", ".ifcZIP", ".ifcXML", ".ifcJSON"
299299
return None
300300

301301

302-
def stream2(path: Union[Path, str]):
302+
def stream2(path: Union[Path, str], mmap: bool = False, page_size: int = 0):
303303
"""Streams the content of a file path from disk, yielding each instance
304304
as a dictionary.
305305
306306
Args:
307307
path (Union[Path, str]): input file path
308+
mmap (bool): open the file contents using memory mapping (only used when page_size is 0)
309+
page_size (int): if non-zero, open the file in Python and feed it to the parser in chunks of page_size characters
308310
309311
Yields:
310312
dict: entity instance dictionaries
311313
"""
312-
streamer = ifcopenshell_wrapper.InstanceStreamer(str(path))
313-
while streamer:
314-
if inst := streamer.read_instance_py():
315-
yield inst
314+
if page_size:
315+
import builtins
316+
317+
f = builtins.open(path, encoding="ascii")
318+
strm = ifcopenshell_wrapper.InstanceStreamer()
319+
strm.pushPage(f.read(page_size))
320+
finished = False
321+
while True:
322+
while strm.hasSemicolon():
323+
if inst := strm.readInstancePy():
324+
yield inst
325+
else:
326+
finished = True
327+
break
328+
if finished:
329+
break
330+
else:
331+
if data := f.read(page_size):
332+
strm.pushPage(data)
333+
else:
334+
break
335+
else:
336+
streamer = ifcopenshell_wrapper.InstanceStreamer(str(path), mmap)
337+
while streamer:
338+
if inst := streamer.readInstancePy():
339+
yield inst
316340

317341

318342
def stream2_from_string(data: str):

src/ifcopenshell-python/test/test_streaming_rocksdb_and_simpletyperefs.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,11 @@ def test_stream():
3232
"value": ({"ref": 136}, {"ref": 138}),
3333
}
3434

35+
def test_chunked_stream():
36+
assert list(ifcopenshell.stream2(fn)) == list(ifcopenshell.stream2(fn, page_size=1024))
37+
38+
def test_mmaped_stream():
39+
assert list(ifcopenshell.stream2(fn)) == list(ifcopenshell.stream2(fn, mmap=True))
3540

3641
def test_file():
3742
f = ifcopenshell.open(fn)

src/ifcparse/FileReader.cpp

Lines changed: 26 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,8 @@
1111
#include <vector>
1212
#include <deque>
1313

14-
// Boost iostreams mmap
1514
#include <boost/iostreams/device/mapped_file.hpp>
1615

17-
18-
1916
namespace {
2017

2118
#if defined(_WIN32)
@@ -32,8 +29,6 @@ namespace {
3229

3330
} // namespace
3431

35-
// ===================== Concrete backends =====================
36-
3732
using namespace IfcParse;
3833

3934
struct FullBufferImpl final : FileReader::Impl {
@@ -64,7 +59,7 @@ struct PagedFileImpl final : FileReader::Impl {
6459

6560
// LRU cache
6661
size_t capacity_ = 8;
67-
mutable std::list<size_t> lru_; // most recent at front
62+
mutable std::list<size_t> lru_;
6863
struct Entry {
6964
FileReader::Page page;
7065
std::list<size_t>::iterator it;
@@ -90,16 +85,14 @@ struct PagedFileImpl final : FileReader::Impl {
9085
char get(size_t pos) const override {
9186
if (pos >= file_size_) throw std::out_of_range("get out of range");
9287
const size_t pidx = pos / page_size_;
93-
const FileReader::Page& p = fetch_page_(pidx);
88+
const FileReader::Page& p = fetchPage_(pidx);
9489
const size_t off = pos % page_size_;
9590
if (off >= p.data.size()) throw std::out_of_range("offset beyond valid page bytes");
96-
// Opportunistic read-ahead for sequential scans
97-
// if (off + 1 == p.data.size()) (void)try_prefetch_(pidx + 1);
9891
return p.data[off];
9992
}
10093

10194
private:
102-
const FileReader::Page& fetch_page_(size_t idx) const {
95+
const FileReader::Page& fetchPage_(size_t idx) const {
10396
auto it = map_.find(idx);
10497
if (it != map_.end()) {
10598
touch_(it);
@@ -116,7 +109,8 @@ struct PagedFileImpl final : FileReader::Impl {
116109
const size_t nread = std::fread(pg.data.data(), 1, avail, fp_);
117110
if (nread != avail) throw std::runtime_error("Short fread on page");
118111
}
119-
pg.data.resize(avail); // trim to actual size
112+
// trim to actual size
113+
pg.data.resize(avail);
120114

121115
// Insert into LRU
122116
if (map_.size() >= capacity_) evict_();
@@ -127,16 +121,6 @@ struct PagedFileImpl final : FileReader::Impl {
127121
return emplaced_it->second.page;
128122
}
129123

130-
/*
131-
bool try_prefetch_(size_t idx) const {
132-
if (idx * page_size_ >= file_size_) return false;
133-
if (map_.find(idx) != map_.end()) return true;
134-
if (map_.size() + 1 > capacity_) return false;
135-
(void)fetch_page_(idx);
136-
return true;
137-
}
138-
*/
139-
140124
void touch_(typename std::unordered_map<size_t, Entry>::iterator it) const {
141125
lru_.erase(it->second.it);
142126
lru_.push_front(it->first);
@@ -189,7 +173,7 @@ struct PushedSequentialImpl final : std::enable_shared_from_this<PushedSequentia
189173
}
190174

191175
// Drop fully-consumed pages so pos is guaranteed to be within the first page
192-
void drop_consumed_up_to(size_t pos) {
176+
void dropPages(size_t pos) override {
193177
while (!pages_.empty()) {
194178
if (pos - discarded_page_bytes_ >= pages_.front().data.size()) {
195179
discarded_page_bytes_ += pages_.front().data.size();
@@ -202,11 +186,17 @@ struct PushedSequentialImpl final : std::enable_shared_from_this<PushedSequentia
202186

203187
char get(size_t pos) const override {
204188
auto self = const_cast<PushedSequentialImpl*>(this);
189+
190+
/*
191+
// We do not do this automatically because all variable-width tokens
192+
// (ENUM/STRING/BINARY/KEYWORD) are stored as file offsets until a full
193+
// entity instance is finalized.
205194
if (this->shared_from_this().use_count() == 2) {
206195
// only drop pages when there is only one active client.
207196
// NB this->shared_from_this() increases count by 1
208197
self->dropPages(pos);
209198
}
199+
*/
210200

211201
const size_t avail_end = size();
212202
if (pos >= avail_end) throw std::out_of_range("pushed backend: position not committed yet");
@@ -226,14 +216,12 @@ struct PushedSequentialImpl final : std::enable_shared_from_this<PushedSequentia
226216
throw std::out_of_range("pushed backend: internal inconsistency");
227217
}
228218

229-
void push_next_page(const std::string& data) override {
219+
void pushNextPage(const std::string& data) override {
230220
FileReader::Page p; p.data.assign(data.data(), data.data() + data.size());
231221
pages_.push_back(std::move(p));
232222
}
233223
};
234224

235-
// ===================== FileReader public API =====================
236-
237225
IfcParse::FileReader::FileReader(const std::string& fn)
238226
: cursor_(0)
239227
{
@@ -255,7 +243,7 @@ IfcParse::FileReader::FileReader(const caller_fed_tag&)
255243
IfcParse::FileReader::FileReader(const std::string& content, const caller_fed_tag&)
256244
{
257245
impl_ = std::make_shared<PushedSequentialImpl>();
258-
impl_->push_next_page(content);
246+
impl_->pushNextPage(content);
259247
}
260248

261249
IfcParse::FileReader::FileReader(const std::string& fn, size_t page_size, size_t page_capacity)
@@ -289,9 +277,19 @@ void FileReader::increment(size_t n) {
289277
cursor_ += n;
290278
}
291279

292-
void IfcParse::FileReader::push_next_page(const std::string& data)
280+
void IfcParse::FileReader::pushNextPage(const std::string& data)
281+
{
282+
impl_->pushNextPage(data);
283+
}
284+
285+
void IfcParse::FileReader::dropPages()
286+
{
287+
impl_->dropPages(0);
288+
}
289+
290+
void IfcParse::FileReader::dropPages(size_t up_to_pos)
293291
{
294-
impl_->push_next_page(data);
292+
impl_->dropPages(up_to_pos);
295293
}
296294

297295
bool IfcParse::FileReader::eof() const

src/ifcparse/FileReader.h

Lines changed: 11 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -29,61 +29,6 @@
2929

3030
#include "ifc_parse_api.h"
3131

32-
/*
33-
#include <string>
34-
35-
#ifdef USE_MMAP
36-
#include <boost/iostreams/device/mapped_file.hpp>
37-
#endif
38-
39-
namespace IfcParse {
40-
/// The FileReader class represents a ISO 10303-21 IFC-SPF file in memory.
41-
/// The file is interpreted as a sequence of tokens which are lazily
42-
/// interpreted only when requested.
43-
class IFC_PARSE_API FileReader {
44-
private:
45-
#ifdef USE_MMAP
46-
boost::iostreams::mapped_file_source mfs;
47-
#endif
48-
FILE* stream_;
49-
const char* buffer_;
50-
size_t ptr_;
51-
size_t len_;
52-
size_t buf_size_;
53-
size_t ptr_offset_ = 0;
54-
55-
public:
56-
bool valid;
57-
bool eof;
58-
size_t size;
59-
60-
FileReader(const std::string& path, bool mmap = false, size_t buf_size = 0);
61-
FileReader(std::istream& stream, int length);
62-
FileReader(void* data, int length);
63-
~FileReader();
64-
/// Returns the character at the cursor
65-
char peek();
66-
/// Returns the character at specified offset
67-
char Read(size_t offset);
68-
/// Increment the file cursor and reads new page if necessary
69-
void increment();
70-
void Close();
71-
/// Moves the file cursor to an arbitrary offset in the file
72-
void seek(size_t offset);
73-
/// Returns the cursor position
74-
size_t Tell() const;
75-
76-
bool is_eof_at(size_t) const;
77-
void increment_at(size_t&);
78-
char peek_at(size_t);
79-
80-
operator bool() const { return valid && !eof; }
81-
};
82-
} // namespace IfcParse
83-
84-
#endif
85-
*/
86-
8732
#include <cstddef>
8833
#include <memory>
8934
#include <optional>
@@ -147,8 +92,13 @@ class IFC_PARSE_API FileReader {
14792

14893
/// \brief Push the next sequential page (pushed backend only).
14994
/// \param data Contents of the page.
150-
/// \throws std::logic_error if the current backend is not pushed mode, or if a next page is already queued.
151-
void push_next_page(const std::string& data);
95+
/// \throws std::logic_error if the current backend is not in pushed mode
96+
void pushNextPage(const std::string& data);
97+
98+
/// \brief Drops pages up to cursor position or provided offset. Does nothing when current backend is not in pushed mode
99+
/// \param up_to_pos Pages with an end offset before up_to_pos are dropped from memory
100+
void dropPages();
101+
void dropPages(size_t up_to_pos);
152102

153103
/// \brief Returns true if the cursor is at or beyond the end of available data.
154104
/// For the pushed backend, EOF means all pushed bytes have been consumed.
@@ -165,9 +115,12 @@ class IFC_PARSE_API FileReader {
165115
virtual size_t size() const = 0;
166116
virtual char get(size_t pos) const = 0;
167117
/// \brief Backend may support pushing pages; default throws.
168-
virtual void push_next_page(const std::string&) {
118+
virtual void pushNextPage(const std::string&) {
169119
throw std::logic_error("push_next_page: backend does not support pushed mode");
170120
}
121+
virtual void dropPages(size_t) {
122+
// empty on purpose
123+
}
171124
};
172125

173126
private:

src/ifcparse/IfcFile.cpp

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -651,9 +651,7 @@ IfcParse::filetype IfcParse::guess_file_type(const std::string& fn) {
651651
}
652652
}
653653

654-
std::optional<std::tuple<size_t, const IfcParse::declaration*, IfcEntityInstanceData>> IfcParse::InstanceStreamer::read_instance() {
655-
// std::cout << "global: " << stream_->tell() << std::endl;
656-
654+
std::optional<std::tuple<size_t, const IfcParse::declaration*, IfcEntityInstanceData>> IfcParse::InstanceStreamer::readInstance() {
657655
std::optional<std::tuple<size_t, const IfcParse::declaration*, IfcEntityInstanceData>> return_value;
658656

659657
if (header_ && yielded_header_instances_ < 3) {
@@ -741,11 +739,12 @@ std::optional<std::tuple<size_t, const IfcParse::declaration*, IfcEntityInstance
741739
break;
742740
}
743741

744-
// std::cout << next_token.startPos << " " << TokenFunc::toString(next_token) << std::endl;
745-
746742
token_stream_.push_back(next_token);
747743
}
748744

745+
// Free already-consumed pages preceding the cursor once variable-width tokens have been materialized into entity instance data objects
746+
(stream_ ? stream_ : (lexer_)->stream)->dropPages();
747+
749748
return return_value;
750749
}
751750

src/ifcparse/IfcFile.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -126,13 +126,13 @@ class IFC_PARSE_API InstanceStreamer {
126126
return storage_.byref_excl_;
127127
}
128128

129-
std::vector<std::unique_ptr<IfcUtil::IfcBaseClass>> steal_instances() {
129+
std::vector<std::unique_ptr<IfcUtil::IfcBaseClass>> stealInstances() {
130130
return storage_.steal_instances();
131131
}
132132

133-
bool has_semicolon() const;
133+
bool hasSemicolon() const;
134134

135-
void push_page(const std::string& page);
135+
void pushPage(const std::string& page);
136136

137137
InstanceStreamer();
138138

@@ -150,7 +150,7 @@ class IFC_PARSE_API InstanceStreamer {
150150
delete header_;
151151
}
152152

153-
std::optional<std::tuple<size_t, const IfcParse::declaration*, IfcEntityInstanceData>> read_instance();
153+
std::optional<std::tuple<size_t, const IfcParse::declaration*, IfcEntityInstanceData>> readInstance();
154154
};
155155

156156
/// This class provides access to the entity instances in an IFC file

0 commit comments

Comments
 (0)