Skip to content

Commit c4b3834

Browse files
committed
Update - The beginning of rev 7. Initial modification saves approx. 1 min.
1 parent 63600d6 commit c4b3834

File tree

2 files changed

+101
-155
lines changed

2 files changed

+101
-155
lines changed

entries/ikelaiah/README.md

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -113,14 +113,19 @@ Iwan Kelaiah
113113
* Updated the rounding method as per the latest `README.md` in the 1BRC GitHub page.
114114

115115
* 1.5
116-
* Revision release - Sequential approach. 6-8 mins on my Inspiron 15 7510 laptop (No improvements on speed).
117-
* Encapsulate process in a class.
118-
* Updated the rounding method as per the latest `README.md` in the 1BRC GitHub page.
116+
* Revision release - Sequential approach. 6-8 mins on my Inspiron 15 7510 laptop (No improvements on speed).
117+
* Encapsulate process in a class.
118+
* Updated the rounding method as per the latest `README.md` in the 1BRC GitHub page.
119119

120120

121121
* 1.6
122-
* Revision release - Sequential approach. 5-7 mins on my Inspiron 15 7510 laptop (a little improvement on speed).
123-
* Introduced a pointer to the weather record, `PStat` = ^TStat. This saves approx. 30 - 60 seconds.
122+
* Revision release - Sequential approach. 5-7 mins on my Inspiron 15 7510 laptop (a little improvement on speed).
123+
* Introduced a pointer to the weather record, `PStat` = ^TStat. This saves approx. 30 - 60 seconds.
124+
125+
* 1.7
126+
* Revision release - Sequential approach. 4-6 mins on my Inspiron 15 7510 laptop (a little improvement on speed).
127+
* Converting each temperature from a float string to an int was a bit slow, so I switched to a lookup table instead. This saves 30-55 seconds.
128+
* Re-arranged `if` statements in two places. Each change saves 10-15 seconds, for a total saving of up to ~30 seconds.
124129

125130
## License
126131

entries/ikelaiah/src/weatherstation.pas

Lines changed: 91 additions & 150 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ interface
99
, SysUtils
1010
, Math
1111
, streamex
12+
, bufstream
1213
, lgHashMap
14+
, StrUtils
1315
{$IFDEF DEBUG}
1416
, Stopwatch
1517
{$ENDIF}
@@ -35,21 +37,23 @@ TStat = record
3537
PStat = ^TStat;
3638

3739
type
38-
// Create a dictionary, now approx 4 mins faster than Generics.Collections.TDictionary
40+
// Using this dictionary is approx. 4 mins faster than Generics.Collections.TDictionary
3941
TWeatherDictionaryLG = specialize TGHashMapQP<string, PStat>;
4042

43+
type
44+
// A lookup type that maps each valid temperature string (e.g. '12.3') to its int64 value in tenths (e.g. 123)
45+
TValidTemperatureDictionary = specialize TGHashMapQP<string, int64>;
46+
4147
type
4248
// Create a class to encapsulate the temperature observations of each weather station.
4349
TWeatherStation = class
4450
private
4551
fname: string;
4652
weatherDictionary: TWeatherDictionaryLG;
4753
weatherStationList: TStringList;
54+
lookupStrFloatToIntList: TValidTemperatureDictionary;
55+
procedure CreateLookupTemp;
4856
procedure ReadMeasurements;
49-
procedure ReadMeasurementsClassic;
50-
procedure ReadMeasurementsInChunks(const filename: string);
51-
procedure ParseStationAndTempFromChunk(const chunkData: pansichar;
52-
const dataSize: int64; const chunkIndex: int64);
5357
procedure ParseStationAndTemp(const line: string);
5458
procedure AddCityTemperatureLG(const cityName: string; const newTemp: int64);
5559
procedure SortWeatherStationAndStats;
@@ -126,6 +130,8 @@ constructor TWeatherStation.Create(const filename: string);
126130
begin
127131
// Assign filename
128132
fname := filename;
133+
// Create a lookup
134+
self.lookupStrFloatToIntList := TValidTemperatureDictionary.Create;
129135
// Create a dictionary
130136
weatherDictionary := TWeatherDictionaryLG.Create;
131137
// Create a TStringList for sorting
@@ -134,8 +140,12 @@ constructor TWeatherStation.Create(const filename: string);
134140

135141
destructor TWeatherStation.Destroy;
136142
var
137-
stationName:string;
143+
stationName: string;
138144
begin
145+
146+
// Free the lookup dictionary
147+
self.lookupStrFloatToIntList.Free;
148+
139149
// Free TStringList dictionary
140150
weatherStationList.Free;
141151

@@ -147,6 +157,30 @@ destructor TWeatherStation.Destroy;
147157
weatherDictionary.Free;
148158
end;
149159

160+
// Fills lookupStrFloatToIntList with one entry per temperature string with
// a single decimal place, from '-100.0' up to and including '99.9'. Each key
// maps to the same temperature expressed as an int64 count of tenths
// (-1000 .. 999), so the parser can turn the text after ';' into an integer
// with a single dictionary lookup.
//
// The keys are assembled with integer arithmetic instead of FormatFloat.
// FormatFloat uses the process-wide decimal separator, so with a locale that
// uses ',' the keys would become e.g. '12,3' and would never match the
// '.'-separated temperatures in the input file.
procedure TWeatherStation.CreateLookupTemp;
const
  startTemp = -1000;  // first value stored, in tenths of a degree
  finishTemp = 1000;  // exclusive upper bound, in tenths of a degree
var
  currentTemp: int64;
  absTemp: int64;
  key: string;
  {$ifdef DEBUG}
  numStr: string;
  {$endif DEBUG}
begin
  currentTemp := startTemp;

  // '<' rather than '<>' so the loop still terminates if the bounds are ever
  // changed in a way that makes the counter step past finishTemp.
  while currentTemp < finishTemp do
  begin
    // Format the magnitude as '<whole>.<tenth>' and add the sign afterwards,
    // so values between -0.9 and -0.1 keep their leading '-0.'.
    absTemp := Abs(currentTemp);
    key := IntToStr(absTemp div 10) + '.' + IntToStr(absTemp mod 10);
    if currentTemp < 0 then
      key := '-' + key;

    self.lookupStrFloatToIntList.Add(key, currentTemp);
    currentTemp := currentTemp + 1;
  end;

  {$ifdef DEBUG}
  // Dump every key/value pair and the total entry count for inspection.
  for numStr in self.lookupStrFloatToIntList.Keys do
    WriteLn('We have key: ', numStr, ' with value of: ',
      IntToStr(self.lookupStrFloatToIntList[numStr]));
  Writeln(self.lookupStrFloatToIntList.Count);
  {$endif DEBUG}
end;
183+
150184
procedure TWeatherStation.PrintSortedWeatherStationAndStats;
151185
var
152186
outputList: string;
@@ -191,7 +225,7 @@ procedure TWeatherStation.SortWeatherStationAndStats;
191225

192226
wsKey := '';
193227

194-
if self.weatherDictionary.GetCapacity = 0 then
228+
if self.weatherDictionary.Count = 0 then
195229
begin
196230
WriteLn('Nothing to Sort.');
197231
Exit;
@@ -204,7 +238,6 @@ procedure TWeatherStation.SortWeatherStationAndStats;
204238

205239
self.weatherStationList.CustomSort(@CustomTStringListComparer);
206240

207-
208241
{$IFDEF DEBUG}
209242
// Display the line.
210243
WriteLn('Sorting done: ', DateTimeToStr(Now));
@@ -222,14 +255,18 @@ procedure TWeatherStation.AddCityTemperatureLG(const cityName: string;
222255
// Get the temp record
223256
stat := self.weatherDictionary[cityName];
224257

225-
// If the temp lower then min, set the new min.
226-
if newTemp < stat^.min then
227-
stat^.min := newTemp;
228-
229-
// If the temp higher than max, set the new max.
230-
if newTemp > stat^.max then
231-
stat^.max := newTemp;
232-
258+
// Update min and max temps if needed
259+
// Re-arranged the if statement, to achieve minimal if checks.
260+
// This saves approx. 15 seconds when processing 1 billion rows.
261+
if (newTemp < stat^.min) or (newTemp > stat^.max) then
262+
begin
263+
// If the temp is lower than min, set the new min.
264+
if newTemp < stat^.min then
265+
stat^.min := newTemp;
266+
// If the temp higher than max, set the new max.
267+
if newTemp > stat^.max then
268+
stat^.max := newTemp;
269+
end;
233270
// Add count for this city.
234271
stat^.sum := stat^.sum + newTemp;
235272

@@ -242,11 +279,12 @@ procedure TWeatherStation.AddCityTemperatureLG(const cityName: string;
242279
// Display the line.
243280
WriteLn('Updated: ', cityName);
244281
{$ENDIF DEBUG}
245-
end;
246-
247-
// If city name doesn't exist add a new entry
248-
if not self.weatherDictionary.Contains(cityName) then
282+
end
283+
else
249284
begin
285+
// Re-arranged this if portion also to achieve minimal if checks.
286+
// This saves approx. 15 seconds when processing 1 billion rows.
287+
// If city name doesn't exist add a new entry
250288
New(stat);
251289
stat^.min := newTemp;
252290
stat^.max := newTemp;
@@ -265,13 +303,33 @@ procedure TWeatherStation.AddCityTemperatureLG(const cityName: string;
265303
procedure TWeatherStation.ParseStationAndTemp(const line: string);
266304
var
267305
delimiterPos: integer;
268-
parsedStation, strTemp: string;
306+
parsedStation, strFloatTemp: string;
307+
results: array of string;
269308
parsedTemp, valCode: int64;
270309
begin
271-
272310
// Get position of the delimiter
273311
delimiterPos := Pos(';', line);
274312
if delimiterPos > 0 then
313+
begin
314+
// Get the weather station name
315+
// Using Copy and POS - as suggested by Gemini AI.
316+
// This part is approx. 3 mins faster when processing 1 billion rows.
317+
//parsedStation := Copy(line, 1, delimiterPos - 1);
318+
strFloatTemp := Copy(line, delimiterPos + 1, Length(line));
319+
320+
// Using a lookup value speeds up 30-45 seconds
321+
if self.lookupStrFloatToIntList.Contains(strFloatTemp) then
322+
begin
323+
parsedTemp := self.lookupStrFloatToIntList[strFloatTemp];
324+
self.AddCityTemperatureLG(Copy(line, 1, delimiterPos - 1),
325+
parsedTemp);
326+
end;
327+
328+
end;
329+
330+
{// Get position of the delimiter
331+
delimiterPos := Pos(';', line);
332+
if delimiterPos > 0 then
275333
begin
276334
// Get the weather station name
277335
// Using Copy and POS - as suggested by Gemini AI.
@@ -280,21 +338,24 @@ procedure TWeatherStation.ParseStationAndTemp(const line: string);
280338
281339
// Get the temperature recorded, as string, remove '.' from string float
282340
// because we want to save it as int64.
283-
strTemp := Copy(line, delimiterPos + 1, Length(line));
284-
// strTemp := StringReplace(strTemp, '.', '', [rfReplaceAll]);
341+
strFloatTemp := Copy(line, delimiterPos + 1, Length(line));
342+
343+
// strFloatTemp := StringReplace(strFloatTemp, '.', '', [rfReplaceAll]);
285344
// The above operation is a bit expensive.
286345
// Rewrote a simple function which prevents creation of new string
287346
// in each iteration. Saved approx 20-30 seconds for 1 billion row.
288347
// Remove dots turns a float into an int.
289-
strTemp := RemoveDots(strTemp);
348+
strFloatTemp := RemoveDots(strFloatTemp);
290349
291350
// Add the weather station and the recorded temp (as int64) in the TDictionary
292-
Val(strTemp, parsedTemp, valCode);
351+
Val(strFloatTemp,
352+
parsedTemp,
353+
valCode);
293354
if valCode <> 0 then Exit;
294355
295356
// Add a record in TWeatherDictionary
296357
self.AddCityTemperatureLG(parsedStation, parsedTemp);
297-
end;
358+
end;}
298359
end;
299360

300361
procedure TWeatherStation.ReadMeasurements;
@@ -312,7 +373,7 @@ procedure TWeatherStation.ReadMeasurements;
312373
// Read and parse chunks of data until EOF -------------------------------
313374
while not streamReader.EOF do
314375
begin
315-
//line := streamReader.ReadLine;
376+
// line := streamReader.ReadLine;
316377
self.ParseStationAndTemp(streamReader.ReadLine);
317378
end;// End of read and parse chunks of data ------------------------------
318379
finally
@@ -324,132 +385,12 @@ procedure TWeatherStation.ReadMeasurements;
324385
end;
325386
end;
326387

327-
procedure TWeatherStation.ReadMeasurementsClassic;
328-
var
329-
inputFile: System.TextFile;
330-
textBuffer: array[1..131072] of byte;
331-
line: string;
332-
begin
333-
334-
// Open the file for reading
335-
AssignFile(inputFile, self.fname);
336-
SetTextBuf(inputFile, textBuffer);
337-
try
338-
Reset(inputFile);
339-
340-
// Read and parse chunks of data until EOF -------------------------------
341-
while not EOF(inputFile) do
342-
begin
343-
ReadLn(inputFile, line);
344-
self.ParseStationAndTemp(line);
345-
end;// End of read and parse chunks of data ------------------------------
346-
347-
finally
348-
// Close the file
349-
CloseFile(inputFile);
350-
end;
351-
end;
352-
353-
procedure TWeatherStation.ParseStationAndTempFromChunk(const chunkData: pansichar;
354-
const dataSize: int64; const chunkIndex: int64);
355-
var
356-
index, lineStart, lineLength: int64;
357-
begin
358-
lineStart := 0;
359-
360-
// Check for Line Feed (LF)
361-
for index := 0 to dataSize - 1 do
362-
begin
363-
if chunkData[index] = #10 then
364-
begin
365-
366-
lineLength := index - lineStart;
367-
368-
// Remove potential CR before LF (for Windows)
369-
if (chunkData[index - 1] = #13) and (index < dataSize - 1) then
370-
Dec(LineLength);
371-
372-
// The current line is now: Buffer[LineStart..LineStart+LineLength-1]
373-
// WriteLn(chunkData[lineStart..lineStart + lineLength - 1], '.');
374-
self.ParseStationAndTemp(chunkData[lineStart..lineStart + lineLength - 1]);
375-
// Skip to the next 'line' in the buffer
376-
lineStart := index + 1;
377-
end;
378-
end;
379-
end;
380-
381-
procedure TWeatherStation.ReadMeasurementsInChunks(const filename: string);
382-
const
383-
defaultChunkSize: int64 = 67108864; // 64MB in bytes
384-
var
385-
fileStream: TFileStream;
386-
buffer: pansichar;
387-
bytesRead, totalBytesRead, chunkSize, lineBreakPos, chunkIndex: int64;
388-
begin
389-
390-
chunkSize := defaultChunkSize * 4; // 256MB in bytes
391-
392-
// Open the file for reading
393-
fileStream := TFileStream.Create(filename, fmOpenRead or fmShareDenyWrite);
394-
try
395-
// Allocate memory buffer for reading chunks
396-
// Ref: https://www.freepascal.org/docs-html/rtl/system/getmem.html
397-
GetMem(buffer, chunkSize);
398-
try
399-
totalBytesRead := 0;
400-
chunkIndex := 0;
401-
402-
// Read and parse chunks of data until EOF
403-
while totalBytesRead < fileStream.Size do
404-
begin
405-
{$IFDEF DEBUG}
406-
WriteLn('Processing chunk index: ', IntToStr(chunkIndex));
407-
{$ENDIF DEBUG}
408-
409-
bytesRead := fileStream.Read(buffer^, chunkSize);
410-
411-
// Update total bytes read
412-
Inc(totalBytesRead, bytesRead);
413-
414-
// Find the position of the last newline character in the chunk
415-
lineBreakPos := BytesRead;
416-
while (lineBreakPos > 0) and (Buffer[lineBreakPos - 1] <> #10) do
417-
Dec(lineBreakPos);
418-
419-
{ Now, must ensure that if the last byte read in the current chunk
420-
is not a newline character, the file pointer is moved back to include
421-
that byte and any preceding bytes of the partial line in the next
422-
chunk's read operation.
423-
424-
Also, no need to update the BytesRead variable in this context because
425-
it represents the actual number of bytes read from the file, including
426-
any partial line that may have been included due to moving the file
427-
pointer back.
428-
Ref: https://www.freepascal.org/docs-html/rtl/classes/tstream.seek.html}
429-
if lineBreakPos < bytesRead then
430-
fileStream.Seek(-(bytesRead - lineBreakPos), soCurrent);
431-
432-
// Parse the buffer line by line here
433-
// This is to slow!
434-
self.ParseStationAndTempFromChunk(buffer, lineBreakPos, chunkIndex);
435-
436-
// Increase chunk index - a counter
437-
Inc(chunkIndex);
438-
end;
439-
finally
440-
// Free the memory buffer
441-
FreeMem(buffer);
442-
end;
443-
finally
444-
// Close the file
445-
fileStream.Free;
446-
end;
447-
end;
448-
449388
// The main algorithm
450389
procedure TWeatherStation.ProcessMeasurements;
451390
begin
391+
self.CreateLookupTemp;
452392
self.ReadMeasurements;
393+
// self.ReadMeasurementsBuf;
453394
// self.ReadMeasurementsClassic;
454395
{This chunking method cuts ~ 30 - 40 seconds of processing time from ~6.45 to 6.00
455396
But the SHA256 at the end is incorrect}

0 commit comments

Comments
 (0)