@@ -9,7 +9,9 @@ interface
99 , SysUtils
1010 , Math
1111 , streamex
12+ , bufstream
1213 , lgHashMap
14+ , StrUtils
1315 { $IFDEF DEBUG}
1416 , Stopwatch
1517 { $ENDIF}
@@ -35,21 +37,23 @@ TStat = record
3537 PStat = ^TStat;
3638
type
  // Hash map keyed by a string (the station/city name passed to
  // AddCityTemperatureLG) whose values are pointers to TStat records.
  // Using this dictionary is approx. 4 minutes faster than
  // Generics.Collections.TDictionary.
  TWeatherDictionaryLG = specialize TGHashMapQP<string, PStat>;
4042
type
  // Lookup table type for valid temperatures: the key is the temperature as
  // text, the value is the matching int64 that CreateLookupTemp stores for it.
  TValidTemperatureDictionary = specialize TGHashMapQP<string, int64>;
46+
4147type
4248 // Create a class to encapsulate the temperature observations of each weather station.
4349 TWeatherStation = class
4450 private
4551 fname: string;
4652 weatherDictionary: TWeatherDictionaryLG;
4753 weatherStationList: TStringList;
54+ lookupStrFloatToIntList: TValidTemperatureDictionary;
55+ procedure CreateLookupTemp ;
4856 procedure ReadMeasurements ;
49- procedure ReadMeasurementsClassic ;
50- procedure ReadMeasurementsInChunks (const filename: string);
51- procedure ParseStationAndTempFromChunk (const chunkData: pansichar;
52- const dataSize: int64; const chunkIndex: int64);
5357 procedure ParseStationAndTemp (const line: string);
5458 procedure AddCityTemperatureLG (const cityName: string; const newTemp: int64);
5559 procedure SortWeatherStationAndStats ;
@@ -126,6 +130,8 @@ constructor TWeatherStation.Create(const filename: string);
126130begin
127131 // Assign filename
128132 fname := filename;
133+ // Create a lookup
134+ self.lookupStrFloatToIntList := TValidTemperatureDictionary.Create;
129135 // Create a dictionary
130136 weatherDictionary := TWeatherDictionaryLG.Create;
131137 // Create a TStringList for sorting
@@ -134,8 +140,12 @@ constructor TWeatherStation.Create(const filename: string);
134140
135141destructor TWeatherStation.Destroy;
136142var
137- stationName:string;
143+ stationName: string;
138144begin
145+
146+ // Free the lookup dictionary
147+ self.lookupStrFloatToIntList.Free;
148+
139149 // Free TStringList dictionary
140150 weatherStationList.Free;
141151
@@ -147,6 +157,30 @@ destructor TWeatherStation.Destroy;
147157 weatherDictionary.Free;
148158end ;
149159
// Fill lookupStrFloatToIntList with every one-decimal temperature text from
// '-100.0' up to '99.9', each mapped to the same temperature expressed in
// tenths of a degree (-1000 .. 999).
//
// The keys are assembled with integer arithmetic rather than FormatFloat, so
// the decimal separator is always '.' regardless of the locale stored in
// DefaultFormatSettings, and no floating-point rounding takes part in building
// a key. With a ',' locale the FormatFloat keys would never match the text
// read from the input file and every line would be skipped.
procedure TWeatherStation.CreateLookupTemp;
const
  startTemp = -1000;  // first value stored, in tenths of a degree
  finishTemp = 1000;  // exclusive upper bound, in tenths of a degree
var
  currentTemp, absTemp: int64;
  key: string;
  {$ifdef DEBUG}
  numStr: string;
  {$endif DEBUG}
begin
  // A bounded for-loop cannot overshoot its end value the way a
  // "while current <> finish" loop can.
  for currentTemp := startTemp to finishTemp - 1 do
  begin
    absTemp := Abs(currentTemp);

    // "<whole>.<tenth>", e.g. 123 -> '12.3', 5 -> '0.5', 0 -> '0.0'
    key := IntToStr(absTemp div 10) + '.' + IntToStr(absTemp mod 10);
    if currentTemp < 0 then
      key := '-' + key;

    self.lookupStrFloatToIntList.Add(key, currentTemp);
  end;

  {$ifdef DEBUG}
  for numStr in self.lookupStrFloatToIntList.Keys do
    WriteLn(' We have key: ', numStr, ' with value of: ',
      IntToStr(self.lookupStrFloatToIntList[numStr]));
  Writeln(self.lookupStrFloatToIntList.Count);
  {$endif DEBUG}
end;
183+
150184procedure TWeatherStation.PrintSortedWeatherStationAndStats ;
151185var
152186 outputList: string;
@@ -191,7 +225,7 @@ procedure TWeatherStation.SortWeatherStationAndStats;
191225
192226 wsKey := ' ' ;
193227
194- if self.weatherDictionary.GetCapacity = 0 then
228+ if self.weatherDictionary.Count = 0 then
195229 begin
196230 WriteLn(' Nothing to Sort.' );
197231 Exit;
@@ -204,7 +238,6 @@ procedure TWeatherStation.SortWeatherStationAndStats;
204238
205239 self.weatherStationList.CustomSort(@CustomTStringListComparer);
206240
207-
208241 { $IFDEF DEBUG}
209242 // Display the line.
210243 WriteLn(' Sorting done: ' , DateTimeToStr(Now));
@@ -222,14 +255,18 @@ procedure TWeatherStation.AddCityTemperatureLG(const cityName: string;
222255 // Get the temp record
223256 stat := self.weatherDictionary[cityName];
224257
225- // If the temp lower then min, set the new min.
226- if newTemp < stat^.min then
227- stat^.min := newTemp;
228-
229- // If the temp higher than max, set the new max.
230- if newTemp > stat^.max then
231- stat^.max := newTemp;
232-
258+ // Update min and max temps if needed
259+ // Re-arranged the if statement, to achieve minimal if checks.
260+ // This saves approx 15 seconds when processing 1 billion row.
261+ if (newTemp < stat^.min) or (newTemp > stat^.max) then
262+ begin
263+ // If the temp lower then min, set the new min.
264+ if newTemp < stat^.min then
265+ stat^.min := newTemp;
266+ // If the temp higher than max, set the new max.
267+ if newTemp > stat^.max then
268+ stat^.max := newTemp;
269+ end ;
233270 // Add count for this city.
234271 stat^.sum := stat^.sum + newTemp;
235272
@@ -242,11 +279,12 @@ procedure TWeatherStation.AddCityTemperatureLG(const cityName: string;
242279 // Display the line.
243280 WriteLn(' Updated: ' , cityName);
244281 { $ENDIF DEBUG}
245- end ;
246-
247- // If city name doesn't exist add a new entry
248- if not self.weatherDictionary.Contains(cityName) then
282+ end
283+ else
249284 begin
285+ // Re-arranged this if portion also to achieve minimal if checks.
286+ // This saves approx 15 seconds when processing 1 billion row.
287+ // If city name doesn't exist add a new entry
250288 New(stat);
251289 stat^.min := newTemp;
252290 stat^.max := newTemp;
@@ -265,13 +303,33 @@ procedure TWeatherStation.AddCityTemperatureLG(const cityName: string;
// Parse one 'station;temperature' line and record the observation.
//
// line: a single line of text. Everything before the first ';' is used as the
//       station name, everything after it as the temperature text.
//
// The temperature text is resolved through lookupStrFloatToIntList, which
// replaced the earlier RemoveDots + Val conversion. Lines that contain no ';',
// or whose temperature text is not a key of the lookup table, are skipped
// without any report.
procedure TWeatherStation.ParseStationAndTemp(const line: string);
var
  delimiterPos: integer;
  parsedTemp: int64;
begin
  // Get position of the delimiter
  delimiterPos := Pos(';', line);
  if delimiterPos <= 0 then
    Exit;

  // TryGetValue tests for the key and fetches its value in a single hash
  // probe; Contains followed by [] hashed the same string twice per line.
  // The station name is only copied out when the temperature is valid.
  if self.lookupStrFloatToIntList.TryGetValue(
    Copy(line, delimiterPos + 1, Length(line)), parsedTemp) then
    self.AddCityTemperatureLG(Copy(line, 1, delimiterPos - 1), parsedTemp);
end;
299360
300361procedure TWeatherStation.ReadMeasurements ;
@@ -312,7 +373,7 @@ procedure TWeatherStation.ReadMeasurements;
312373 // Read and parse chunks of data until EOF -------------------------------
313374 while not streamReader.EOF do
314375 begin
315- // line := streamReader.ReadLine;
376+ // line := streamReader.ReadLine;
316377 self.ParseStationAndTemp(streamReader.ReadLine);
317378 end ;// End of read and parse chunks of data ------------------------------
318379 finally
@@ -324,132 +385,12 @@ procedure TWeatherStation.ReadMeasurements;
324385 end ;
325386end ;
326387
// Read the file named by fname line by line through a System.TextFile with a
// 128 KB text buffer, handing every line to ParseStationAndTemp.
procedure TWeatherStation.ReadMeasurementsClassic;
var
  inputFile: System.TextFile;
  textBuffer: array [1..131072] of byte;
  line: string;
begin
  // Open the file for reading
  AssignFile(inputFile, self.fname);
  SetTextBuf(inputFile, textBuffer);

  // Reset stays outside the try block: if opening fails there is nothing to
  // close, and calling CloseFile on a file that was never opened would raise
  // a second error that hides the original one.
  Reset(inputFile);
  try
    // Read and parse lines until EOF
    while not EOF(inputFile) do
    begin
      ReadLn(inputFile, line);
      self.ParseStationAndTemp(line);
    end;
  finally
    // Close the file
    CloseFile(inputFile);
  end;
end;
352-
// Split a memory chunk into LF-terminated lines and pass each one to
// ParseStationAndTemp.
//
// chunkData:  start of the chunk.
// dataSize:   number of valid bytes in chunkData. Bytes after the last LF are
//             not parsed; the caller must hand them in again with the next
//             chunk.
// chunkIndex: not used by this routine; kept so existing callers still
//             compile.
procedure TWeatherStation.ParseStationAndTempFromChunk(const chunkData: pansichar;
  const dataSize: int64; const chunkIndex: int64);
var
  index, lineStart, lineLength: int64;
  line: string;
begin
  lineStart := 0;

  // Check for Line Feed (LF)
  for index := 0 to dataSize - 1 do
  begin
    if chunkData[index] <> #10 then
      Continue;

    lineLength := index - lineStart;

    // Remove potential CR before LF (for Windows). Testing lineLength first
    // keeps index - 1 inside the current line, so the byte in front of the
    // buffer is never read when the chunk starts with an LF, and the last
    // line of the chunk gets the same treatment as every other line.
    if (lineLength > 0) and (chunkData[index - 1] = #13) then
      Dec(lineLength);

    // The current line is chunkData[lineStart .. lineStart + lineLength - 1]
    SetString(line, chunkData + lineStart, lineLength);
    self.ParseStationAndTemp(line);

    // Skip to the next 'line' in the buffer
    lineStart := index + 1;
  end;
end;
380-
// Read the given file in 256 MB chunks and parse every line it contains.
//
// filename: path of the file to read.
//
// Each chunk is cut at its last LF; the bytes after that LF belong to a line
// that continues in the next chunk, so the stream is moved back to re-read
// them. The loop is driven by what Read actually returns. Counting bytesRead
// into a running total would count the re-read bytes twice and end the loop
// before the end of the file has been parsed.
//
// Raises Exception when a chunk that is not the last one contains no LF at
// all (a single line longer than the chunk), because seeking back would
// re-read the very same chunk forever.
procedure TWeatherStation.ReadMeasurementsInChunks(const filename: string);
const
  defaultChunkSize: int64 = 67108864; // 64MB in bytes
var
  fileStream: TFileStream;
  buffer: pansichar;
  bytesRead, chunkSize, lineBreakPos, chunkIndex, tailLength: int64;
  tailLine: string;
  atEndOfFile: boolean;
begin

  chunkSize := defaultChunkSize * 4; // 256MB in bytes

  // Open the file for reading
  fileStream := TFileStream.Create(filename, fmOpenRead or fmShareDenyWrite);
  try
    // Allocate memory buffer for reading chunks
    // Ref: https://www.freepascal.org/docs-html/rtl/system/getmem.html
    GetMem(buffer, chunkSize);
    try
      chunkIndex := 0;

      // Read and parse chunks of data until nothing more can be read
      while True do
      begin
        bytesRead := fileStream.Read(buffer^, chunkSize);
        if bytesRead <= 0 then
          Break;

        {$IFDEF DEBUG}
        WriteLn(' Processing chunk index: ', IntToStr(chunkIndex));
        {$ENDIF DEBUG}

        // Remember whether this read reached the end of the file; checked
        // before any seek-back changes the stream position.
        atEndOfFile := fileStream.Position >= fileStream.Size;

        // Find the position just after the last newline character in the chunk
        lineBreakPos := bytesRead;
        while (lineBreakPos > 0) and (buffer[lineBreakPos - 1] <> #10) do
          Dec(lineBreakPos);

        // Parse all complete lines of this chunk
        self.ParseStationAndTempFromChunk(buffer, lineBreakPos, chunkIndex);

        // Bytes after the last LF form an incomplete line
        tailLength := bytesRead - lineBreakPos;
        if tailLength > 0 then
        begin
          if atEndOfFile then
          begin
            // The file ends without a final LF: the tail is the last line.
            if buffer[bytesRead - 1] = #13 then
              Dec(tailLength);
            SetString(tailLine, buffer + lineBreakPos, tailLength);
            self.ParseStationAndTemp(tailLine);
          end
          else if lineBreakPos = 0 then
            raise Exception.CreateFmt(
              'No line break found in a %d byte chunk of "%s"',
              [bytesRead, filename])
          else
            // Move the file pointer back so the partial line is read again
            // at the start of the next chunk.
            // Ref: https://www.freepascal.org/docs-html/rtl/classes/tstream.seek.html
            fileStream.Seek(-tailLength, soCurrent);
        end;

        // Increase chunk index - a counter
        Inc(chunkIndex);
      end;
    finally
      // Free the memory buffer
      FreeMem(buffer);
    end;
  finally
    // Close the file
    fileStream.Free;
  end;
end;
448-
449388// The main algorithm
450389procedure TWeatherStation.ProcessMeasurements ;
451390begin
391+ self.CreateLookupTemp;
452392 self.ReadMeasurements;
393+ // self.ReadMeasurementsBuf;
453394 // self.ReadMeasurementsClassic;
454395 { This chunking method cuts ~ 30 - 40 seconds of processing time from ~6.45 to 6.00
455396 But the SHA256 at the end is incorrect}
0 commit comments