55interface
66
77uses
8- Classes,
9- SysUtils,
10- Math,
11- streamex,
12- lgHashMap
8+ Classes
9+ , SysUtils
10+ , Math
11+ , streamex
12+ , lgHashMap
1313 { $IFDEF DEBUG}
1414 , Stopwatch
1515 { $ENDIF}
16- , Baseline.Common
17- ;
16+ , Baseline.Common;
1817
1918type
20- TParsedData = record
21- wsName: string;
22- wsTemp: int64;
23- end ;
19+ { Create a record of temperature stats.
2420
25- type
26- // Create a record of temperature stats.
21+ Borrowed the concept from go's approach to improve performance, save floats as int64.
22+ This saved ~2 mins processing time for processing 1 billion rows. }
2723 TStat = record
2824 var
29- min: int64; // Borrowed the concept from go's approach to improve
30- // performance, save floats as int64.
31- max: int64; // This saved ~2 mins processing time.
25+ min: int64;
26+ max: int64;
3227 sum: int64;
3328 cnt: int64;
3429 public
3530 constructor Create(const newMin: int64; const newMax: int64;
3631 const newSum: int64; const newCount: int64);
3732 function ToString : string;
3833 end ;
34+ { Using pointer to TStat saves approx. 30-60 seconds for processing 1 billion rows}
35+ PStat = ^TStat;
3936
4037type
41- // Create a dictionary
42- TWeatherDictionaryLG = specialize TGHashMapQP<string, TStat >;
38+ // Create a dictionary, now approx 4 mins faster than Generics.Collections.TDictionary
39+ TWeatherDictionaryLG = specialize TGHashMapQP<string, PStat >;
4340
4441type
4542 // Create a class to encapsulate the temperature observations of each weather station.
@@ -49,6 +46,10 @@ TWeatherStation = class
4946 weatherDictionary: TWeatherDictionaryLG;
5047 weatherStationList: TStringList;
5148 procedure ReadMeasurements ;
49+ procedure ReadMeasurementsClassic ;
50+ procedure ReadMeasurementsInChunks (const filename: string);
51+ procedure ParseStationAndTempFromChunk (const chunkData: pansichar;
52+ const dataSize: int64; const chunkIndex: int64);
5253 procedure ParseStationAndTemp (const line: string);
5354 procedure AddCityTemperatureLG (const cityName: string; const newTemp: int64);
5455 procedure SortWeatherStationAndStats ;
@@ -132,10 +133,17 @@ constructor TWeatherStation.Create(const filename: string);
132133end ;
133134
{ Releases everything the weather station owns: the sorted output list, every
  heap-allocated TStat record referenced from the dictionary, and finally the
  dictionary container itself. }
destructor TWeatherStation.Destroy;
var
  stationName: string;
begin
  // Free the TStringList (Free is nil-safe).
  weatherStationList.Free;

  // The dictionary stores PStat pointers, so the records they point to must
  // be disposed one by one before the container goes away.
  // The Assigned guard matters because a destructor also runs when the
  // constructor raised before the dictionary was created; calling .Keys on a
  // nil reference would turn that failure into an access violation.
  if Assigned(weatherDictionary) then
  begin
    // 1. Release every PStat that was allocated with New().
    for stationName in weatherDictionary.Keys do
      Dispose(PStat(weatherDictionary.Items[stationName]));

    // 2. Release the container itself.
    weatherDictionary.Free;
  end;

  // Let the ancestor class release whatever it owns.
  inherited Destroy;
end;
141149
@@ -145,6 +153,11 @@ procedure TWeatherStation.PrintSortedWeatherStationAndStats;
145153 index: int64;
146154begin
147155
156+ { $IFDEF DEBUG}
157+ // Display the line.
158+ WriteLn(' Printing now: ' , DateTimeToStr(Now));
159+ { $ENDIF DEBUG}
160+
148161 if self.weatherStationList.Count = 0 then
149162 begin
150163 WriteLn(' Nothing to print. The list is empty.' );
@@ -159,31 +172,49 @@ procedure TWeatherStation.PrintSortedWeatherStationAndStats;
159172 // Remove last comma and space; ', ', a neat trick from Gus.
160173 SetLength(outputList, Length(outputList) - 2 );
161174 WriteLn(' {' , outputList, ' }' );
175+
176+ { $IFDEF DEBUG}
177+ // Display the line.
178+ WriteLn(' Printing done: ' , DateTimeToStr(Now));
179+ { $ENDIF DEBUG}
162180end ;
163181
{ Copies one formatted entry per dictionary key into weatherStationList
  (key, separator, the station's TStat.ToString text, trailing separator) and
  then orders the list with CustomTStringListComparer.

  Prints a message and returns early, leaving the list untouched, when the
  dictionary reports a capacity of zero. }
procedure TWeatherStation.SortWeatherStationAndStats;
var
  stationKey: string;
  entry: string;
begin
  // The conditional directives must be written without a space after the
  // opening brace; otherwise they are ordinary comments and the timing
  // lines would be printed in every build.
  {$IFDEF DEBUG}
  WriteLn(' Sorting now: ', DateTimeToStr(Now));
  {$ENDIF DEBUG}

  // NOTE(review): this tests the container capacity, not the number of
  // stored entries - confirm whether Count was intended here.
  if weatherDictionary.GetCapacity = 0 then
  begin
    WriteLn(' Nothing to Sort.');
    Exit;
  end;

  // Build the printable entry for every station.
  for stationKey in weatherDictionary.Keys do
  begin
    entry := stationKey + ' =' + weatherDictionary[stationKey]^.ToString + ' , ';
    weatherStationList.Add(entry);
  end;

  weatherStationList.CustomSort(@CustomTStringListComparer);

  {$IFDEF DEBUG}
  WriteLn(' Sorting done: ', DateTimeToStr(Now));
  {$ENDIF DEBUG}
end;
182213
183214procedure TWeatherStation.AddCityTemperatureLG (const cityName: string;
184215 const newTemp: int64);
185216var
186- stat: TStat ;
217+ stat: PStat ;
187218begin
188219 // If city name exists, modify temp as needed
189220 if self.weatherDictionary.Contains(cityName) then
@@ -192,34 +223,41 @@ procedure TWeatherStation.AddCityTemperatureLG(const cityName: string;
192223 stat := self.weatherDictionary[cityName];
193224
194225 // If the temp is lower than min, set the new min.
195- if newTemp < stat.min then
196- stat.min := newTemp;
226+ if newTemp < stat^ .min then
227+ stat^ .min := newTemp;
197228
198229 // If the temp higher than max, set the new max.
199- if newTemp > stat.max then
200- stat.max := newTemp;
230+ if newTemp > stat^ .max then
231+ stat^ .max := newTemp;
201232
202233 // Add the new temperature to this city's running sum.
203- stat.sum := stat.sum + newTemp;
234+ stat^ .sum := stat^ .sum + newTemp;
204235
205236 // Increase the counter
206- stat.cnt := stat.cnt + 1 ;
237+ stat^ .cnt := stat^ .cnt + 1 ;
207238
208239 // Update the stat of this city
209- self.weatherDictionary.AddOrSetValue(cityName, stat);
240+ // self.weatherDictionary.AddOrSetValue(cityName, stat);
210241 { $IFDEF DEBUG}
211242 // Display the line.
212- // WriteLn('Updated: ', cityName);
243+ WriteLn(' Updated: ' , cityName);
213244 { $ENDIF DEBUG}
214245 end ;
215246
216247 // If city name doesn't exist add a new entry
217248 if not self.weatherDictionary.Contains(cityName) then
218249 begin
219- self.weatherDictionary.Add(cityName, TStat.Create(newTemp, newTemp, newTemp, 1 ));
250+ New(stat);
251+ stat^.min := newTemp;
252+ stat^.max := newTemp;
253+ stat^.sum := newTemp;
254+ stat^.cnt := 1 ;
255+ self.weatherDictionary.Add(cityName, stat);
256+
220257 { $IFDEF DEBUG}
221258 // Display the line.
222- // WriteLn('Added: ', cityName);
259+ WriteLn(' weatherDictionary count: ' , inttostr(self.weatherDictionary.Count));
260+ WriteLn(' Added: ' , cityName);
223261 { $ENDIF DEBUG}
224262 end ;
225263end ;
@@ -269,13 +307,13 @@ procedure TWeatherStation.ReadMeasurements;
269307 // Open the file for reading
270308 fileStream := TFileStream.Create(self.fname, fmOpenRead or fmShareDenyNone);
271309 try
272- streamReader := TStreamReader.Create(fileStream);
310+ streamReader := TStreamReader.Create(fileStream, 65536 * 2 , False );
273311 try
274312 // Read and parse chunks of data until EOF -------------------------------
275313 while not streamReader.EOF do
276314 begin
277- line := streamReader.ReadLine;
278- self.ParseStationAndTemp(line );
315+ // line := streamReader.ReadLine;
316+ self.ParseStationAndTemp(streamReader.ReadLine );
279317 end ;// End of read and parse chunks of data ------------------------------
280318 finally
281319 streamReader.Free;
@@ -286,10 +324,136 @@ procedure TWeatherStation.ReadMeasurements;
286324 end ;
287325end ;
288326
{ Reads the measurements file (self.fname) line by line through a classic
  Pascal TextFile with a 128 KB text buffer, handing every line to
  ParseStationAndTemp. }
procedure TWeatherStation.ReadMeasurementsClassic;
var
  inputFile: System.TextFile;
  textBuffer: array[1..131072] of byte;
  line: string;
begin
  // Bind the file name and install the larger I/O buffer before opening.
  AssignFile(inputFile, self.fname);
  SetTextBuf(inputFile, textBuffer);

  // Open OUTSIDE the try block: if Reset fails, the file was never opened and
  // CloseFile in the finally section would raise a second error that hides
  // the real cause.
  Reset(inputFile);
  try
    // Read and parse line by line until EOF.
    while not EOF(inputFile) do
    begin
      ReadLn(inputFile, line);
      self.ParseStationAndTemp(line);
    end;
  finally
    // Close the file
    CloseFile(inputFile);
  end;
end;
352+
{ Splits a raw memory chunk into lines and passes each one to
  ParseStationAndTemp.

  chunkData  - pointer to the first byte of the chunk.
  dataSize   - number of valid bytes in chunkData.
  chunkIndex - running number of the chunk; not used by the parser itself.

  Lines are terminated by LF; a CR directly before the LF is dropped. Bytes
  after the last LF (an unterminated final line) are parsed as a line too. }
procedure TWeatherStation.ParseStationAndTempFromChunk(const chunkData: pansichar;
  const dataSize: int64; const chunkIndex: int64);
var
  index, lineStart: int64;
  line: ansistring;

  // Copies the bytes [fromPos, toPos) into a string, strips one trailing CR
  // and forwards the result to the line parser.
  procedure EmitLine(const fromPos, toPos: int64);
  var
    lineLength: int64;
  begin
    lineLength := toPos - fromPos;

    // Only look at the previous byte when the line is non-empty; this keeps
    // the read inside the buffer when the chunk starts with a line break and
    // also strips the CR on the very last line of the chunk.
    if (lineLength > 0) and (chunkData[toPos - 1] = #13) then
      Dec(lineLength);

    SetString(line, @chunkData[fromPos], lineLength);
    self.ParseStationAndTemp(line);
  end;

begin
  lineStart := 0;

  // Every Line Feed (LF) closes the line that began at lineStart.
  for index := 0 to dataSize - 1 do
  begin
    if chunkData[index] = #10 then
    begin
      EmitLine(lineStart, index);
      // Continue with the byte after the LF.
      lineStart := index + 1;
    end;
  end;

  // Data after the last LF is an unterminated line; do not lose it.
  if lineStart < dataSize then
    EmitLine(lineStart, dataSize);
end;
380+
{ Reads the given file in large chunks (256 MB) and parses each chunk line by
  line.

  A chunk normally ends in the middle of a line. The bytes after the last LF
  are not parsed with the current chunk; the stream is rewound by exactly that
  many bytes so the partial line is read again, complete, at the start of the
  next chunk. Loop progress is taken from the stream position, so rewound
  bytes are never counted as processed.

  Raises Exception when a single line does not fit into one chunk. }
procedure TWeatherStation.ReadMeasurementsInChunks(const filename: string);
const
  defaultChunkSize: int64 = 67108864; // 64MB in bytes
var
  fileStream: TFileStream;
  buffer: pansichar;
  tail: ansistring;
  bytesRead, fileSize, chunkSize, lineBreakPos, tailLength, chunkIndex: int64;
begin
  chunkSize := defaultChunkSize * 4; // 256MB in bytes

  // Open the file for reading
  fileStream := TFileStream.Create(filename, fmOpenRead or fmShareDenyWrite);
  try
    // Allocate memory buffer for reading chunks
    // Ref: https://www.freepascal.org/docs-html/rtl/system/getmem.html
    GetMem(buffer, chunkSize);
    try
      // Query the size once instead of on every loop iteration.
      fileSize := fileStream.Size;
      chunkIndex := 0;

      // Read and parse chunks of data until EOF. Position already reflects
      // any rewind done below, so no separate byte counter is needed.
      while fileStream.Position < fileSize do
      begin
        {$IFDEF DEBUG}
        WriteLn(' Processing chunk index: ', IntToStr(chunkIndex));
        {$ENDIF DEBUG}

        bytesRead := fileStream.Read(buffer^, chunkSize);
        if bytesRead <= 0 then
          Break; // nothing more can be read; avoid spinning forever

        // Find the position just after the last newline character in the chunk
        lineBreakPos := bytesRead;
        while (lineBreakPos > 0) and (buffer[lineBreakPos - 1] <> #10) do
          Dec(lineBreakPos);

        tailLength := 0;
        if lineBreakPos < bytesRead then
        begin
          if fileStream.Position >= fileSize then
            // End of file: the bytes after the last LF are the final,
            // unterminated line. There is no next chunk to pick them up.
            tailLength := bytesRead - lineBreakPos
          else if lineBreakPos = 0 then
            // No LF in the whole chunk: rewinding would re-read the same
            // bytes endlessly.
            raise Exception.CreateFmt(
              'Line longer than chunk size (%d bytes) in %s',
              [chunkSize, filename])
          else
            // Move the file pointer back so the partial line is included in
            // the next chunk's read operation.
            // Ref: https://www.freepascal.org/docs-html/rtl/classes/tstream.seek.html
            fileStream.Seek(-(bytesRead - lineBreakPos), soCurrent);
        end;

        // Parse the complete lines of this chunk.
        // Note: this is too slow!
        self.ParseStationAndTempFromChunk(buffer, lineBreakPos, chunkIndex);

        // Parse the unterminated last line of the file, if any.
        if tailLength > 0 then
        begin
          // Drop a trailing CR (Windows line ending without the final LF).
          if buffer[bytesRead - 1] = #13 then
            Dec(tailLength);
          if tailLength > 0 then
          begin
            SetString(tail, @buffer[lineBreakPos], tailLength);
            self.ParseStationAndTemp(tail);
          end;
        end;

        // Increase chunk index - a counter
        Inc(chunkIndex);
      end;
    finally
      // Free the memory buffer
      FreeMem(buffer);
    end;
  finally
    // Close the file
    fileStream.Free;
  end;
end;
448+
{ The main algorithm: read and aggregate the measurements, build the sorted
  station list, then print it. }
procedure TWeatherStation.ProcessMeasurements;
begin
  // Step 1 - read the input. Alternative readers kept for benchmarking:
  //   ReadMeasurementsClassic;
  //   ReadMeasurementsInChunks(fname);
  // The chunked reader cuts ~30 - 40 seconds of processing time
  // (from ~6.45 to 6.00), but the SHA256 of its output was incorrect.
  ReadMeasurements;

  // Step 2 - sort the stations and their stats.
  SortWeatherStationAndStats;

  // Step 3 - print the sorted result.
  PrintSortedWeatherStationAndStats;
end;
0 commit comments