1 % (c) 2020-2024 Lehrstuhl fuer Softwaretechnik und Programmiersprachen,
2 % Heinrich Heine Universitaet Duesseldorf
3 % This software is licenced under EPL 1.0 (http://www.eclipse.org/org/documents/epl-v10.html)
4
5 :- module(tools_fastread, [ read_term_from_stream/2,
6 read_term_from_string/2,
7 read_test/1, tok_test/1,
8 fastrw_read/3
9 ]).
10
11 :- use_module(module_information,[module_info/2]).
12
13 :- module_info(group,infrastructure).
14 :- module_info(description,'This module provides a more efficient way to parse and read large Prolog terms.').
15
16 :- use_module(self_check).
17 :- use_module(error_manager,[add_error/3]).
18
19
20 :- use_module(error_manager).
21
22
23 :- use_module(library(fastrw),[fast_read/2]).
24 % this reads terms in binary format
25 fastrw_read(S,Term,Error) :-
26 catch( fast_read(S,Term1), % from library fastrw
27 error(E,_),
28 ( E=syntax_error(_) -> Error = true
29 ; E=permission_error(_,_,_) -> Term1 = end_of_file
30 ; E=consistency_error(_,_,_)
31 -> add_error(tools_fastread,'Consistency error when reading from stream: ',E),
32 Error = true
33 ;
34 add_error(tools_fastread,'Unknown error when reading from stream: ',E),
35 throw(E))
36 ),
37 Term1 = Term.
38
39
40
41
42 :- set_prolog_flag(double_quotes, codes).
43
44 % Limitations:
45 % - only prefix notation and lists supported
46 % - supports atoms with and without quotes, in quotes one can use octal and hex literals and backslash escaping
47 % - no support for variables (yet?)
48 % - no support for operator declarations
49 % - should work with numbers (integers and floats)
50 % - no real support for (a,b) terms using , as infix
51 % - no support for block comments
52
53
54 % --------------------
55
56 % Tokeniser
57
58
59 :- use_module(tools,[print_memory_used_wo_gc/0]).
60 tok_read(File) :- statistics(walltime,_),
61 %open(File,read,Stream1),reset_line_nr, repeat, get_code(Stream1,C), C = -1,!, close(Stream1),
62 %statistics(walltime,[_,W1]), format('~n% Walltime ~w ms to get_codes ~w~n',[W1,File]),
63 open(File,read,Stream),reset_line_nr,
64 read_tokens(Stream),close(Stream),
65 statistics(walltime,[_,W2]), format('~n% Walltime ~w ms to tokenise ~w~n',[W2,File]),
66 print_memory_used_wo_gc.
67
68 % set_prolog_flag(profiling,on)
69 tok_test(1) :-
70 tok_read('/Users/leuschel/git_root/prob_examples/public_examples/B/Demo/Bakery0.prob').
71 tok_test(2) :-
72 tok_read('/Users/leuschel/git_root/prob_examples/public_examples/B/Benchmarks/CAN_BUS_tlc.prob').
73 tok_test(3) :-
74 tok_read('/Users/leuschel/git_root/prob_examples/public_examples/B/PerformanceTests/Generated/Generated4000.prob').
75 tok_test(4) :-
76 tok_read('/Users/leuschel/git_root/prob_examples/public_examples/B/PerformanceTests/Generated/Generated10000.prob').
77 tok_test(7) :-
78 tok_read('/Users/leuschel/git_root/private_examples/ClearSy/2019_Nov/rule_perf4/rule_split.prob').
79
80 read_tokens(Stream) :-
81 get_next_token(Stream,Tok), % print_token(Tok),
82 !,
83 (Tok = eof -> true
84 ; read_tokens(Stream)).
85 read_tokens(Stream) :-
86 get_code(Stream,C),
87 format('Could not process: ~w~n',[C]).
88
89 %print_token(atom(L)) :- !, format('Token: Atom ~s~n',[L]).
90 %print_token(S) :- !, format('Token: ~w~n',[S]).
91
92 % get next Prolog token from a stream
93 % TO DO: produce a version without peek, which returns first code following token
94 get_next_token(Stream,Token) :- get_code(Stream,C), !,
95 ? get_next_token1(C,Stream,Token).
96
97
98 get_next_token1(-1,_,eof).
99 get_next_token1(32,Stream,Token) :- !, get_next_token(Stream,Token).
100 get_next_token1(10,Stream,Token) :- !, inc_line_nr, get_next_token(Stream,Token).
101 get_next_token1(13,Stream,Token) :- !, get_next_token(Stream,Token).
102 get_next_token1(0'(,_,open_paren). % adding a cut here seems to degrade performance, but leads to backtracking
103 get_next_token1(0'),_,close_paren).
104 get_next_token1(0'[,_,open_bracket).
105 get_next_token1(0'],_,close_bracket).
106 get_next_token1(0',,_,comma).
107 get_next_token1(0'.,_,dot).
108 get_next_token1(39,Stream,atom(T)) :- !, get_quoted_atom_codes(Stream,T).
109 get_next_token1(0'%,Stream,Token) :- !, % line comment
110 skip_until_end_of_line(Stream),!,get_next_token(Stream,Token).
111 get_next_token1(X,Stream,number([X|T])) :- is_digit(X),!, get_number_codes(Stream,T).
112 get_next_token1(X,Stream,atom([X|T])) :- is_lowcase(X),!, get_atom_codes(Stream,T).
113 get_next_token1(X,_,_) :- prolog_code_error(X),fail.
114
115 get_number_codes(Stream,[H|T]) :- peek_code(Stream,H), is_digit(H),
116 !, get_code(Stream,_), get_number_codes(Stream,T).
117 get_number_codes(_,[]).
118 % TO DO: also accept floats
119
120 get_atom_codes(Stream,[H|T]) :-
121 ? peek_code(Stream,H), is_unquoted_atom_letter(H),!,
122 get_code(Stream,_), get_atom_codes(Stream,T).
123 get_atom_codes(_,[]).
124
125 get_quoted_atom_codes(Stream,Res) :- get_code(Stream,H),
126 ( H = 92 -> get_code(Stream,H2), % backslash: read next code and treat it specially
127 treat_escaped_code(H2,Stream,Res)
128 ; H = 39 -> Res=[]
129 ; H = -1 -> format('Unterminated quoted atom: ~w~n',[H]),fail
130 ; Res=[H|T], get_quoted_atom_codes(Stream,T)).
131
132 skip_until_end_of_line(Stream) :- get_code(Stream,H),
133 (H = 10 -> inc_line_nr
134 ; H = -1 -> format('Unterminated % comment~n',[]), prolog_token_error(H), fail
135 ; skip_until_end_of_line(Stream)).
136
137 % see page 60, Section 4.1.7.6 Escape Sequences in SICStus Prolog manual
138 treat_escaped_code(Code,Stream,[NewCode|T]) :- escape_conversion(Code,NewCode),!,
139 get_quoted_atom_codes(Stream,T).
140 treat_escaped_code(Octal,Stream,[OctResult|T]) :- is_octal_digit(Octal,Nr),!, % treat chars like \374\
141 get_code(Stream,NxtCode),
142 get_octal_codes(NxtCode,Stream,Nr,OctResult),
143 get_quoted_atom_codes(Stream,T).
144 treat_escaped_code(0'x,Stream,[HexResult|T]) :- !, % treat chars like \x21D4\
145 get_code(Stream,NxtCode),
146 get_hex_codes(NxtCode,Stream,0,HexResult),
147 get_quoted_atom_codes(Stream,T).
148 treat_escaped_code(10,Stream,T) :- !, %\LFD ignored ; TO DO: also support 13
149 get_quoted_atom_codes(Stream,T).
150 treat_escaped_code(-1,_,_) :- !, format('Unterminated quoted atom ending with backslash at: ~w~n',[end_of_file]),fail.
151 treat_escaped_code(X,_,_) :- prolog_code_error('escape sequence',X),fail.
152
153 escape_conversion(39,39). % single quote '
154 escape_conversion(34,34). % double quote "
155 escape_conversion(92,92).
156 escape_conversion(0'n,10).
157 escape_conversion(0't,9).
158 escape_conversion(0'b,8).
159 escape_conversion(0'r,13).
160
161
162 get_octal_codes(H,Stream,Acc,Res) :- is_octal_digit(H,HNr),
163 !, A2 is Acc*8+HNr,
164 get_code(Stream,NxtCode), get_octal_codes(NxtCode,Stream,A2,Res).
165 get_octal_codes(92,_,Acc,Res) :- !, Res=Acc.
166 get_octal_codes(X,_,_,_) :- prolog_code_error('octal literal',X),fail.
167
168 get_hex_codes(H,Stream,Acc,Res) :- is_hex_digit(H,HNr),
169 !, A2 is Acc*16+HNr,
170 get_code(Stream,NxtCode), get_hex_codes(NxtCode,Stream,A2,Res).
171 get_hex_codes(92,_,Acc,Res) :- !, Res=Acc.
172 get_hex_codes(X,_,_,_) :- prolog_code_error('hex literal',X),fail.
173
174
175
176 % 45 = leading minus sign
177 is_digit(X) :- X=45 ; (X>=48, X=<57).
178
179 :- assert_must_succeed(tools_fastread:is_octal_digit(0'7,7)).
180 is_octal_digit(X,Nr) :- X>=48, X=<55, Nr is X-48.
181 :- assert_must_succeed(tools_fastread:is_hex_digit(0'f,15)).
182 :- assert_must_succeed(tools_fastread:is_hex_digit(0'F,15)).
183 :- assert_must_succeed(tools_fastread:is_hex_digit(0'9,9)).
184 is_hex_digit(X,Nr) :- X>=48, X=<57, Nr is X-48.
185 is_hex_digit(X,Nr) :- X>=65, X=<70, Nr is X-55. % upper-case A-F
186 is_hex_digit(X,Nr) :- X>=97, X=<102, Nr is X-87. % lower-case a-f
187 is_lowcase(X) :- X>=97, X=<122.
188
189 is_unquoted_atom_letter(X) :- X>=97, X=<122.
190 is_unquoted_atom_letter(X) :- X>=65, X=<90.
191 is_unquoted_atom_letter(X) :- X=95.
192 is_unquoted_atom_letter(X) :- X>=48, X=<57.
193
194 %is_quoted_atom_letter(X) :- X \= 39. %, X \=10, X \=13.
195
196 % skip over whitespace and return code after whitspace (peeked)
197 skip_ws(Stream,NextCode) :- peek_code(Stream,C),
198 (is_skip_whitespace(C) -> get_code(Stream,_),skip_ws(Stream,NextCode)
199 ; NextCode=C).
200
201 is_skip_whitespace(32).
202 is_skip_whitespace(10) :- inc_line_nr.
203 is_skip_whitespace(13).
204
205 % ------------------------
206
207 ?expect_token(Stream,ETok) :- get_next_token(Stream,RTok),!,
208 (RTok = ETok -> true
209 ; cur_line(L),
210 format(user_error,'Unexpected token on line ~w: ~w, expected ~w~n',[L,RTok,ETok]),
211 fail).
212
213 prolog_token_error(Tok) :- cur_line(L),
214 %format(user_error,'Unexpected token on line ~w: ~w~n',[L,Tok]),
215 add_error(fastread,'Unexpected token on line:',Tok:L).
216 prolog_code_error(Code) :- cur_line(L),
217 format(user_error,'Unexpected character on line ~w: ~w "~s"~n',[L,Code,[Code]]),
218 add_error(fastread,'Unexpected character code on line:',Code:line(L)).
219 prolog_code_error(Kind,Code) :- cur_line(L),
220 format(user_error,'Unexpected character for ~w on line ~w: ~w "~s"~n',[Kind,L,Code,[Code]]),
221 add_error(fastread,'Unexpected character code on line:',Kind:Code:line(L)).
222
223 :- dynamic cur_line/1.
224 cur_line(1).
225 inc_line_nr :- retract(cur_line(N)), N1 is N+1, assertz(cur_line(N1)).
226 reset_line_nr :- retract(cur_line(_)), assertz(cur_line(1)).
227
228 % ------------------------
229
230 :- assert_must_succeed(tools_fastread:read_term_from_string("atom('a',1,2).",atom(a,1,2))).
231 :- assert_must_succeed(tools_fastread:read_term_from_string("'\\''.",'\'')).
232 :- assert_must_succeed(tools_fastread:read_term_from_string("'\\374\\'.",'\374\')).
233
234 :- use_module(library(codesio),[open_codes_stream/2]).
235 read_term_from_string(Codes,Term) :-
236 open_codes_stream(Codes,Stream),
237 read_term_from_stream(Stream,Term),
238 close(Stream).
239
240
241 % read next term on stream, term has to be followed by '.'
242 read_term_from_stream(Stream,Term) :-
243 get_next_token(Stream,Token),!,
244 ? get_next_term(Token,Stream,Term),!,
245 (Term='end_of_file' -> true
246 ; expect_token(Stream,dot)).
247
248 % get one term, without dot at end:
249 get_next_term(eof,_,'end_of_file').
250 get_next_term(number(C),_,Term) :- number_codes(Term,C).
251 ?get_next_term(atom(C),Stream,Term) :- atom_codes(Atom,C), complete_atom(Atom,Stream,Term).
252 get_next_term(open_bracket,Stream,List) :- get_term_list(Stream,close_bracket,List).
253 get_next_term(open_paren,Stream,'$pair'(List)) :- get_term_list(Stream,close_paren,List).
254 get_next_term(close_paren,_,_) :- prolog_token_error(dot),fail.
255 get_next_term(close_bracket,_,_) :- prolog_token_error(dot),fail.
256 get_next_term(comma,_,_) :- prolog_token_error(dot),fail.
257 get_next_term(dot,_,_) :- prolog_token_error(dot),fail.
258
259 % check if an atom is followed by a list of arguments (Term1,...)
260 complete_atom(Atom,Stream,Term) :- skip_ws(Stream,C), C==0'(, !,
261 get_code(Stream,_),
262 % TO DO: special case for binary operators like conjunct, transform into flat list (associativity holds) meaning we can get tail-recursion
263 ? get_term_list(Stream,close_paren,List),
264 Term =.. [Atom|List].
265 complete_atom(Atom,_,Atom). % atom without parentheses, i.e., on its own
266
267 % pre-construct term for certain functors to make above call tail-recursive
268 % however, as conjunctions are often stored the other way conjunct(pos,conjunct(...),Pred) this is not really beneficial
269 % we could have a special get_conjunct which creates a list
270 known_arity(empty_set,empty_set(Pos),[Pos]).
271 known_arity(boolean_true,boolean_true(Pos),[Pos]).
272 known_arity(boolean_false,boolean_false(Pos),[Pos]).
273 known_arity(identifier,identifier(Pos,ID),[Pos,ID]).
274 known_arity(integer,integer(Pos,I),[Pos,I]).
275 known_arity(string,string(Pos,I),[Pos,I]).
276 known_arity(equal,equal(Pos,A,B),[Pos,A,B]).
277 known_arity(not_equal,not_equal(Pos,A,B),[Pos,A,B]).
278 %known_arity(couple,couple(Pos,A,B),[Pos,A,B]). % couple(Pos,List)
279 known_arity(couple,couple(Pos,L),[Pos,L]).
280 known_arity(set_extension,set_extension(Pos,L),[Pos,L]).
281 known_arity(sequence_extension,sequence_extension(Pos,L),[Pos,L]).
282 known_arity(conjunct,conjunct(Pos,A,B),[Pos,A,B]).
283 known_arity(disjunct,disjunct(Pos,A,B),[Pos,A,B]).
284 known_arity(interval,interval(Pos,A,B),[Pos,A,B]).
285 known_arity(member,member(Pos,A,B),[Pos,A,B]).
286 known_arity(function,function(Pos,A,B),[Pos,A,B]).
287 known_arity(assign,assign(Pos,A,B),[Pos,A,B]).
288 known_arity(p3,p3(A,B,C),[A,B,C]).
289 known_arity(p4,p4(A,B,C,D),[A,B,C,D]).
290 known_arity(p5,p5(A,B,C,D,E),[A,B,C,D,E]).
291
292 % get list of terms separated by , and terminated by provided token
293 ?get_term_list(Stream,EndToken,List) :- get_next_token(Stream,Token), get_term_list_aux(Token,Stream,EndToken,List).
294
295 get_term_list_aux(EndToken,_,EndToken,List) :- !, List=[].
296 get_term_list_aux(Token,Stream,EndToken,[Term1|TList]) :- get_next_term(Token,Stream,Term1),
297 ? get_next_token(Stream,Token2),!,
298 ? get_term_list_aux2(Token2,Stream,EndToken,TList).
299
300 get_term_list_aux2(EndToken,_,EndToken,List) :- !, List=[].
301 get_term_list_aux2(comma,Stream,EndToken,[Term1|TList]) :-
302 get_next_token(Stream,Token1),
303 get_next_term(Token1,Stream,Term1),!,
304 ? get_next_token(Stream,Token2),
305 ? get_term_list_aux2(Token2,Stream,EndToken,TList).
306 get_term_list_aux2(Token2,_,EndToken,_) :-
307 format(user_error,'Unexpected token: ~w, expected ~w~n',[Token2,EndToken]),fail.
308
309 term_read(File) :- statistics(walltime,_),
310 open(File,read,Stream),read_terms(Stream),close(Stream),
311 statistics(walltime,[_,W1]), format('~n% Walltime ~w ms to tokenise and read ~w~n',[W1,File]),
312 print_memory_used_wo_gc.
313
314 read_terms(Stream) :-
315 read_term_from_stream(Stream,Term),
316 %read(Stream,Term), % Prolog's read
317 write_term(Term,[max_depth(5),numbervars(true)]),nl,
318 !,
319 (Term = 'end_of_file' -> true
320 ; read_terms(Stream)).
321 read_terms(Stream) :-
322 get_code(Stream,C),
323 format('Could not process: ~w~n',[C]).
324
325 read_test(1) :-
326 term_read('/Users/leuschel/git_root/prob_examples/public_examples/B/Demo/Bakery0.prob').
327 read_test(3) :-
328 term_read('/Users/leuschel/git_root/prob_examples/public_examples/B/PerformanceTests/Generated/Generated4000.prob').
329 read_test(4) :-
330 term_read('/Users/leuschel/git_root/prob_examples/public_examples/B/PerformanceTests/Generated/Generated10000.prob').
331
332 read_test(7) :-
333 term_read('/Users/leuschel/git_root/private_examples/ClearSy/2019_Nov/rule_perf4/rule_split.prob').
334
335 read_test(8) :-
336 term_read('/Users/leuschel/git_root/prob_examples/public_examples/B/PerformanceTests/SATLIB/sudoku.prob').
337
338
339
340 /*
341
342 with new read_term_from_stream:
343
344 | ?- read_test(4).
345 parser_version(2019-03-05 17:22:26.861)
346 classical_b(Generated10000,[/Users/leuschel/git_root/prob_examples/public_examples/B/PerformanceTests/Generated/Generated10000.mch])
347 machine(abstract_machine(pos(1,1,1,1,50007,4),machine(pos(2,1,1,1,1,8)),machine_header(pos(3,1,1,9,1,23),Generated10000,[]),[constants(pos(...),[...|...]),properties(pos(...),conjunct(...)),variables(...)|...]))
348 end_of_file
349
350 % Walltime 774 ms to tokenise /Users/leuschel/git_root/prob_examples/public_examples/B/PerformanceTests/Generated/Generated10000.prob
351 108.476 MB
352
353 with fastreadwrite:
354 % Walltime 89 ms to fastread from ./fastrw_out.probz
355 109.004 MB
356
357
358 with Prolog read
359 | ?- rtest4.
360 parser_version(2019-03-05 17:22:26.861)
361 classical_b(Generated10000,[/Users/leuschel/git_root/prob_examples/public_examples/B/PerformanceTests/Generated/Generated10000.mch])
362 machine(abstract_machine(pos(1,1,1,1,50007,4),machine(pos(2,1,1,1,1,8)),machine_header(pos(3,1,1,9,1,23),Generated10000,[]),[constants(pos(...),[...|...]),properties(pos(...),conjunct(...)),variables(...)|...]))
363 end_of_file
364
365 % Walltime 4193 ms to tokenise /Users/leuschel/git_root/prob_examples/public_examples/B/PerformanceTests/Generated/Generated10000.prob
366 286.429 MB
367
368
369 % Walltime 102 ms to get_codes /Users/leuschel/git_root/prob_examples/public_examples/B/PerformanceTests/Generated/Generated4000.prob
370
371 % Walltime 233 ms to tokenise /Users/leuschel/git_root/prob_examples/public_examples/B/PerformanceTests/Generated/Generated4000.prob
372 116.813 MB ( 11.956 MB program)
373
374
375
376 | ?- rtest7.
377 parser_version(23928fd10a7a580ccd609bf614ed4552107bbd6a)
378 classical_b(rule,[/Users/leuschel/git_root/private_examples/ClearSy/2019_Nov/rule_perf4/rule_split.mch,/Users/leuschel/git_root/prob_prolog/stdlib/CHOOSE.def,/Users/leuschel/git_root/prob_prolog/stdlib/LibraryBits.def,/Users/leuschel/git_root/prob_prolog/stdlib/LibraryCSV.def|...])
379 machine(abstract_machine(pos(4687857,17,1,1,79,4),machine(pos(4687858,17,1,1,1,8)),machine_header(pos(4687859,17,1,9,1,21),LibraryFiles,[]),[abstract_constants(pos(...),[...|...]),properties(pos(...),conjunct(...)),definitions(...)|...]))
380 machine(abstract_machine(pos(4688547,18,1,1,94,4),machine(pos(4688548,18,1,1,1,8)),machine_header(pos(4688549,18,1,9,1,20),LibraryMath,[]),[abstract_constants(pos(...),[...|...]),definitions(pos(...),[...|...]),properties(...)|...]))
381 machine(abstract_machine(pos(4689324,19,1,1,176,4),machine(pos(4689325,19,1,1,1,8)),machine_header(pos(4689326,19,1,9,1,23),LibraryStrings,[]),[abstract_constants(pos(...),[...|...]),properties(pos(...),conjunct(...)),definitions(...)|...]))
382 machine(abstract_machine(pos(1,1,1,1,1929826,4),machine(pos(2,1,1,1,1,8)),machine_header(pos(3,1,2,5,2,9),rule,[]),[definitions(pos(...),[...|...]),sees(pos(...),[...|...]),constants(...)|...]))
383 end_of_file
384
385 % Walltime 31964 ms to tokenise /Users/leuschel/git_root/private_examples/ClearSy/2019_Nov/rule_perf4/rule_split.prob
386 659.087 MB
387 yes
388
389 % Walltime 6243 ms to fastread from ./fastrw_out.probz
390 659.025 MB
391
392
393 % Walltime 5676 ms to get_codes /Users/leuschel/git_root/private_examples/ClearSy/2019_Nov/rule_perf4/rule_split.prob
394
395 % Walltime 15616 ms to tokenise /Users/leuschel/git_root/private_examples/ClearSy/2019_Nov/rule_perf4/rule_split.prob
396 116.840 MB ( 11.982 MB program)
397
398 */
399