Skip to content

Commit 63600d6

Browse files
authored
Merge pull request #88 from ikelaiah/ikelaiah-rev06
Ikelaiah rev06
2 parents 9174964 + e5c01ec commit 63600d6

File tree

2 files changed

+204
-35
lines changed

2 files changed

+204
-35
lines changed

entries/ikelaiah/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,11 @@ Iwan Kelaiah
117117
* Encapsulate process in a class.
118118
* Updated the rounding method as per the latest `README.md` in the 1BRC GitHub page.
119119

120+
121+
* 1.6
122+
* Revision release - Sequential approach. 5-7 mins on my Inspiron 15 7510 laptop (a little improvement on speed).
123+
* Introduced a pointer to the weather record, `PStat = ^TStat`. This saves approx. 30-60 seconds.
124+
120125
## License
121126

122127
This project is licensed under the MIT License - see the LICENSE.md file for details

entries/ikelaiah/src/weatherstation.pas

Lines changed: 199 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -5,41 +5,38 @@
55
interface
66

77
uses
8-
Classes,
9-
SysUtils,
10-
Math,
11-
streamex,
12-
lgHashMap
8+
Classes
9+
, SysUtils
10+
, Math
11+
, streamex
12+
, lgHashMap
1313
{$IFDEF DEBUG}
1414
, Stopwatch
1515
{$ENDIF}
16-
, Baseline.Common
17-
;
16+
, Baseline.Common;
1817

1918
type
20-
TParsedData = record
21-
wsName: string;
22-
wsTemp: int64;
23-
end;
19+
{ Create a record of temperature stats.
2420
25-
type
26-
// Create a record of temperature stats.
21+
Borrowed the concept from go's approach to improve performance, save floats as int64.
22+
This saved ~2 mins processing time for processing 1 billion rows.}
2723
TStat = record
2824
var
29-
min: int64; // Borrowed the concept from go's approach to improve
30-
// performance, save floats as int64.
31-
max: int64; // This saved ~2 mins processing time.
25+
min: int64;
26+
max: int64;
3227
sum: int64;
3328
cnt: int64;
3429
public
3530
constructor Create(const newMin: int64; const newMax: int64;
3631
const newSum: int64; const newCount: int64);
3732
function ToString: string;
3833
end;
34+
{Using pointer to TStat saves approx. 30-60 seconds for processing 1 billion rows}
35+
PStat = ^TStat;
3936

4037
type
41-
// Create a dictionary
42-
TWeatherDictionaryLG = specialize TGHashMapQP<string, TStat>;
38+
// Create a dictionary, now approx 4 mins faster than Generics.Collections.TDictionary
39+
TWeatherDictionaryLG = specialize TGHashMapQP<string, PStat>;
4340

4441
type
4542
// Create a class to encapsulate the temperature observations of each weather station.
@@ -49,6 +46,10 @@ TWeatherStation = class
4946
weatherDictionary: TWeatherDictionaryLG;
5047
weatherStationList: TStringList;
5148
procedure ReadMeasurements;
49+
procedure ReadMeasurementsClassic;
50+
procedure ReadMeasurementsInChunks(const filename: string);
51+
procedure ParseStationAndTempFromChunk(const chunkData: pansichar;
52+
const dataSize: int64; const chunkIndex: int64);
5253
procedure ParseStationAndTemp(const line: string);
5354
procedure AddCityTemperatureLG(const cityName: string; const newTemp: int64);
5455
procedure SortWeatherStationAndStats;
@@ -132,10 +133,17 @@ constructor TWeatherStation.Create(const filename: string);
132133
end;
133134

134135
destructor TWeatherStation.Destroy;
136+
var
137+
stationName:string;
135138
begin
136-
// Free TStringLIst dictionary
139+
// Free TStringList dictionary
137140
weatherStationList.Free;
138-
// Free the dictionary
141+
142+
// Free the dictionary - 1. Free PStat first
143+
for stationName in self.weatherDictionary.Keys do
144+
Dispose(PStat(self.weatherDictionary.Items[stationName]));
145+
146+
// Free the dictionary - 2. Finally free the container itself
139147
weatherDictionary.Free;
140148
end;
141149

@@ -145,6 +153,11 @@ procedure TWeatherStation.PrintSortedWeatherStationAndStats;
145153
index: int64;
146154
begin
147155

156+
{$IFDEF DEBUG}
157+
// Display the line.
158+
WriteLn('Printing now: ', DateTimeToStr(Now));
159+
{$ENDIF DEBUG}
160+
148161
if self.weatherStationList.Count = 0 then
149162
begin
150163
WriteLn('Nothing to print. The list is empty.');
@@ -159,31 +172,49 @@ procedure TWeatherStation.PrintSortedWeatherStationAndStats;
159172
// Remove last comma and space; ', ', a neat trick from Gus.
160173
SetLength(outputList, Length(outputList) - 2);
161174
WriteLn('{', outputList, '}');
175+
176+
{$IFDEF DEBUG}
177+
// Display the line.
178+
WriteLn('Printing done: ', DateTimeToStr(Now));
179+
{$ENDIF DEBUG}
162180
end;
163181

164182
procedure TWeatherStation.SortWeatherStationAndStats;
165183
var
166184
wsKey: string;
167185
begin
186+
187+
{$IFDEF DEBUG}
188+
// Display the line.
189+
WriteLn('Sorting now: ', DateTimeToStr(Now));
190+
{$ENDIF DEBUG}
191+
168192
wsKey := '';
169193

170194
if self.weatherDictionary.GetCapacity = 0 then
171195
begin
172196
WriteLn('Nothing to Sort.');
173197
Exit;
174198
end;
199+
175200
for wsKey in weatherDictionary.Keys do
176201
begin
177-
self.weatherStationList.Add(wsKey + '=' + weatherDictionary[wsKey].ToString + ', ');
202+
self.weatherStationList.Add(wsKey + '=' + weatherDictionary[wsKey]^.ToString + ', ');
178203
end;
179204

180205
self.weatherStationList.CustomSort(@CustomTStringListComparer);
206+
207+
208+
{$IFDEF DEBUG}
209+
// Display the line.
210+
WriteLn('Sorting done: ', DateTimeToStr(Now));
211+
{$ENDIF DEBUG}
181212
end;
182213

183214
procedure TWeatherStation.AddCityTemperatureLG(const cityName: string;
184215
const newTemp: int64);
185216
var
186-
stat: TStat;
217+
stat: PStat;
187218
begin
188219
// If the city name exists, update its stats as needed
189220
if self.weatherDictionary.Contains(cityName) then
@@ -192,34 +223,41 @@ procedure TWeatherStation.AddCityTemperatureLG(const cityName: string;
192223
stat := self.weatherDictionary[cityName];
193224

194225
// If the temp is lower than min, set the new min.
195-
if newTemp < stat.min then
196-
stat.min := newTemp;
226+
if newTemp < stat^.min then
227+
stat^.min := newTemp;
197228

198229
// If the temp higher than max, set the new max.
199-
if newTemp > stat.max then
200-
stat.max := newTemp;
230+
if newTemp > stat^.max then
231+
stat^.max := newTemp;
201232

202233
// Add the new temperature to this city's running sum.
203-
stat.sum := stat.sum + newTemp;
234+
stat^.sum := stat^.sum + newTemp;
204235

205236
// Increase the counter
206-
stat.cnt := stat.cnt + 1;
237+
stat^.cnt := stat^.cnt + 1;
207238

208239
// Update the stat of this city
209-
self.weatherDictionary.AddOrSetValue(cityName, stat);
240+
// self.weatherDictionary.AddOrSetValue(cityName, stat);
210241
{$IFDEF DEBUG}
211242
// Display the line.
212-
// WriteLn('Updated: ', cityName);
243+
WriteLn('Updated: ', cityName);
213244
{$ENDIF DEBUG}
214245
end;
215246

216247
// If city name doesn't exist add a new entry
217248
if not self.weatherDictionary.Contains(cityName) then
218249
begin
219-
self.weatherDictionary.Add(cityName, TStat.Create(newTemp, newTemp, newTemp, 1));
250+
New(stat);
251+
stat^.min := newTemp;
252+
stat^.max := newTemp;
253+
stat^.sum := newTemp;
254+
stat^.cnt := 1;
255+
self.weatherDictionary.Add(cityName, stat);
256+
220257
{$IFDEF DEBUG}
221258
// Display the line.
222-
// WriteLn('Added: ', cityName);
259+
WriteLn('weatherDictionary count: ', inttostr(self.weatherDictionary.Count));
260+
WriteLn('Added: ', cityName);
223261
{$ENDIF DEBUG}
224262
end;
225263
end;
@@ -269,13 +307,13 @@ procedure TWeatherStation.ReadMeasurements;
269307
// Open the file for reading
270308
fileStream := TFileStream.Create(self.fname, fmOpenRead or fmShareDenyNone);
271309
try
272-
streamReader := TStreamReader.Create(fileStream);
310+
streamReader := TStreamReader.Create(fileStream, 65536 * 2, False);
273311
try
274312
// Read and parse chunks of data until EOF -------------------------------
275313
while not streamReader.EOF do
276314
begin
277-
line := streamReader.ReadLine;
278-
self.ParseStationAndTemp(line);
315+
//line := streamReader.ReadLine;
316+
self.ParseStationAndTemp(streamReader.ReadLine);
279317
end;// End of read and parse chunks of data ------------------------------
280318
finally
281319
streamReader.Free;
@@ -286,10 +324,136 @@ procedure TWeatherStation.ReadMeasurements;
286324
end;
287325
end;
288326

327+
{ Reads self.fname line by line with the classic Pascal text-file routines
  and passes every line to ParseStationAndTemp.

  A 128 KiB local buffer is attached with SetTextBuf before the file is
  opened, so the runtime reads the file in larger blocks.

  The file is opened BEFORE the try block: when Reset fails there is nothing
  to close, and calling CloseFile on a file that was never opened would raise
  a second I/O error that hides the original one. }
procedure TWeatherStation.ReadMeasurementsClassic;
var
  inputFile: System.TextFile;
  textBuffer: array[1..131072] of byte;
  line: string;
begin
  // Bind the file name and attach the larger read buffer (must precede Reset)
  AssignFile(inputFile, self.fname);
  SetTextBuf(inputFile, textBuffer);

  // Open for reading; a failure here propagates without touching CloseFile
  Reset(inputFile);
  try
    // Read and parse one line at a time until EOF
    while not EOF(inputFile) do
    begin
      ReadLn(inputFile, line);
      self.ParseStationAndTemp(line);
    end;
  finally
    // The file is known to be open at this point, so closing is safe
    CloseFile(inputFile);
  end;
end;
352+
353+
{ Splits a raw byte buffer into LF-terminated lines and passes each line to
  ParseStationAndTemp.

  chunkData  - pointer to the first byte of the chunk.
  dataSize   - number of valid bytes starting at chunkData.
  chunkIndex - sequence number of the chunk; not used by the parsing itself.

  Only lines that end with LF (#10) are parsed. Bytes after the last LF are
  ignored, so the caller is expected to pass a size that ends on a line break.
  A CR (#13) directly in front of the LF is dropped, so CRLF input yields the
  same line text as LF input. }
procedure TWeatherStation.ParseStationAndTempFromChunk(const chunkData: pansichar;
  const dataSize: int64; const chunkIndex: int64);
var
  index, lineStart, lineLength: int64;
  line: string;
begin
  lineStart := 0;

  for index := 0 to dataSize - 1 do
  begin
    // Only a Line Feed (LF) ends a line
    if chunkData[index] <> #10 then
      Continue;

    lineLength := index - lineStart;

    // Remove a potential CR before the LF (Windows line endings).
    // lineLength > 0 guarantees index - 1 >= lineStart, so the check never
    // reads before the start of the buffer (e.g. when the first byte is LF)
    // and never looks at a byte belonging to the previous line.
    if (lineLength > 0) and (chunkData[index - 1] = #13) then
      Dec(lineLength);

    // Copy exactly the bytes chunkData[lineStart..lineStart + lineLength - 1]
    SetString(line, chunkData + lineStart, lineLength);
    self.ParseStationAndTemp(line);

    // Skip to the next 'line' in the buffer
    lineStart := index + 1;
  end;
end;
380+
381+
{ Reads the file in large chunks and parses every chunk line by line.

  filename - path of the measurements file to read.

  Each chunk is cut at its last LF. The bytes after that LF belong to a line
  that continues in the next chunk, so the stream is moved back and those
  bytes are read again at the start of the next chunk.

  The progress counter advances only by the bytes that were actually handed
  to the parser. Advancing it by the full read size while also seeking back
  makes the counter run ahead of the file position, which ends the loop
  before the end of the file is reached and drops the last records.

  Raises Exception when a chunk that is not the last one contains no LF at
  all, i.e. a single line is longer than the chunk buffer. }
procedure TWeatherStation.ReadMeasurementsInChunks(const filename: string);
const
  defaultChunkSize: int64 = 67108864; // 64MB in bytes
var
  fileStream: TFileStream;
  buffer: pansichar;
  tailLine: string;
  bytesRead, totalBytesConsumed, chunkSize, lineBreakPos, chunkIndex: int64;
  fileSize, tailLength: int64;
  isLastChunk: boolean;
begin

  chunkSize := defaultChunkSize * 4; // 256MB in bytes

  // Open the file for reading
  fileStream := TFileStream.Create(filename, fmOpenRead or fmShareDenyWrite);
  try
    // Allocate memory buffer for reading chunks
    // Ref: https://www.freepascal.org/docs-html/rtl/system/getmem.html
    GetMem(buffer, chunkSize);
    try
      // Query the size once instead of on every loop iteration
      fileSize := fileStream.Size;
      totalBytesConsumed := 0;
      chunkIndex := 0;

      // Read and parse chunks of data until EOF
      while totalBytesConsumed < fileSize do
      begin
        {$IFDEF DEBUG}
        WriteLn('Processing chunk index: ', IntToStr(chunkIndex));
        {$ENDIF DEBUG}

        bytesRead := fileStream.Read(buffer^, chunkSize);

        // Nothing more could be read; stop instead of looping forever
        if bytesRead <= 0 then
          Break;

        isLastChunk := (totalBytesConsumed + bytesRead) >= fileSize;

        // Find the position just after the last newline character in the chunk
        lineBreakPos := bytesRead;
        while (lineBreakPos > 0) and (buffer[lineBreakPos - 1] <> #10) do
          Dec(lineBreakPos);

        // A chunk without any line break can only be the unterminated end of
        // the file; anywhere else the line does not fit into the buffer.
        if (lineBreakPos = 0) and (not isLastChunk) then
          raise Exception.CreateFmt(
            'No line break found within a chunk of %d bytes', [chunkSize]);

        // Parse all complete lines of this chunk
        if lineBreakPos > 0 then
          self.ParseStationAndTempFromChunk(buffer, lineBreakPos, chunkIndex);

        if lineBreakPos < bytesRead then
        begin
          if isLastChunk then
          begin
            // The file does not end with LF: parse the remaining bytes as the
            // final line, without a trailing CR if there is one.
            tailLength := bytesRead - lineBreakPos;
            if buffer[bytesRead - 1] = #13 then
              Dec(tailLength);
            if tailLength > 0 then
            begin
              SetString(tailLine, buffer + lineBreakPos, tailLength);
              self.ParseStationAndTemp(tailLine);
            end;
            // Everything that was read has now been consumed
            lineBreakPos := bytesRead;
          end
          else
            // Move the file pointer back so the partial line is read again
            // as the beginning of the next chunk.
            // Ref: https://www.freepascal.org/docs-html/rtl/classes/tstream.seek.html
            fileStream.Seek(-(bytesRead - lineBreakPos), soCurrent);
        end;

        // Count only the bytes that were parsed, not the bytes that were read
        Inc(totalBytesConsumed, lineBreakPos);

        // Increase chunk index - a counter
        Inc(chunkIndex);
      end;
    finally
      // Free the memory buffer
      FreeMem(buffer);
    end;
  finally
    // Close the file
    fileStream.Free;
  end;
end;
448+
289449
// The main algorithm
290450
procedure TWeatherStation.ProcessMeasurements;
291451
begin
292452
self.ReadMeasurements;
453+
// self.ReadMeasurementsClassic;
454+
{This chunking method cuts ~ 30 - 40 seconds of processing time from ~6.45 to 6.00
455+
But the SHA256 at the end is incorrect}
456+
// self.ReadMeasurementsInChunks(self.fname);
293457
self.SortWeatherStationAndStats;
294458
self.PrintSortedWeatherStationAndStats;
295459
end;

0 commit comments

Comments
 (0)