Skip to content

Commit d04e85a

Browse files
committed
Added a dev version with comments
1 parent 97f16d4 commit d04e85a

File tree

2 files changed

+369
-27
lines changed

2 files changed

+369
-27
lines changed

Chunker-DEV.php

Lines changed: 369 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,369 @@
1+
<?php
2+
3+
if (!class_exists("Chunker")) {

    /**
     * A lightweight streaming XML file splitter with built-in tag data validation, written on top of
     * PHP's XML Parser extension (xml_parser_create() and friends).
     *
     * The input document is read line by line and split on a "main" tag: every complete main element
     * whose checked tags pass validation is copied into the current chunk, and as soon as a chunk holds
     * $chunkSize main elements it is written to its own file, so that slower servers, plugins etc. can
     * process XML files with 10.000+ records piece by piece.
     *
     * Output files are named {outputFilePrefix}{CHUNK_NUMBER}.xml, where CHUNK_NUMBER starts at 0.
     * Every chunk is a complete document: XML declaration, one root element, the main elements, and
     * the closing root tag.
     *
     * Known limitations (deliberate, inherited from the original implementation):
     * - attributes of the copied elements are dropped;
     * - anything outside a main element (the document root itself, wrapper or metadata elements,
     *   comments, processing instructions) is not copied, apart from whitespace between main elements;
     * - the written root element always carries the xmlns:xsd / xmlns:xsi declarations.
     *
     * MINIMUM PHP VERSION: 7.4
     *
     * @author Borsodi Gergő
     * @version 1.0
     * @link https://github.com/borsodigerii/php-xml-chunker
     */
    class Chunker
    {
        /**
         * The name of the file to be processed.
         * @var string
         */
        private string $xmlFile;

        /**
         * The maximum number of main elements per chunk.
         * @var int
         */
        private int $chunkSize;

        /**
         * Counter for the chunks; also the number used in the next chunk's file name.
         * @var int
         */
        private int $CHUNKS;

        /**
         * The (already XML-escaped) data that will be written into the next chunk.
         * @var string
         */
        private string $PAYLOAD = '';

        /**
         * The (already XML-escaped) data collected for the main element currently being parsed.
         * It is only moved into $PAYLOAD once the element is complete and has passed validation.
         * @var string
         */
        private string $PAYLOAD_TEMP = '';

        /**
         * Raw (decoded, unescaped) character data collected since the last opening checked tag or
         * the last closing tag. This is what gets handed to the validator.
         * @var string
         */
        private string $DATA_BETWEEN = '';

        /**
         * The root tag written around the main elements of every chunk.
         * @var string
         */
        private string $rootTag;

        /**
         * The charset used for the decoding/encoding process.
         * @var string
         */
        private string $CHARSET;

        /**
         * The prefix used for the output files.
         * @var string
         */
        private string $outputFilePrefix;

        /**
         * Counter for the items put into the current chunk.
         * @var int
         */
        private int $ITEMCOUNT = 0;

        /**
         * The main tag; one element with this name is one item in the chunking.
         * @var string
         */
        private string $CHUNKON;

        /**
         * A variable used for logging.
         * @var string
         */
        private string $log = "";

        /**
         * The total number of main elements written to chunks.
         * @var int
         */
        private int $totalItems = 0;

        /**
         * Indicates that the main element currently being parsed contains data that failed validation.
         * @var bool
         */
        private bool $excludedItemFound = false;

        /**
         * Indicates that the data read next has to be validated, because its opening tag is present
         * in $checkingTags.
         * @var bool
         */
        private bool $checkNextData = false;

        /**
         * The tag name of the data that is about to be validated.
         * @var string
         */
        private string $checkNextDataTag = '';

        /**
         * The tag names whose data has to be validated at runtime.
         * @var array
         */
        private array $checkingTags = array();

        /**
         * The callback that performs the validation.
         * @var callable
         */
        private $passesValidation;

        /**
         * How many main elements are currently open. 0 means the parser is outside of any main
         * element (in the document root or another wrapper), where nothing but whitespace is copied.
         * Values above 1 only occur when a main element contains elements of the same name.
         * @var int
         */
        private int $itemDepth = 0;

        /**
         * Creates a new Chunker.
         *
         * @param string $xmlfile The path of the xml file
         * @param int $chunkSize The maximum number of main elements (see chunkXML()) one chunk file may contain. **Default: 100**
         * @param string $outputFilePrefix The prefix of the chunk file names. The pattern is *{outputFilePrefix}{CHUNK_NUMBER}.xml*, numbered from 0. **Default: 'out-'.** Example files with the default prefix: 'out-0.xml', 'out-1.xml' etc
         * @param callable|null $validationFunction The validator, run every time a tag listed in $checkingTags is closed inside a main element. If it returns **true** the data is valid; otherwise the whole enclosing main element is left out of the chunks. It is mandatory: passing null raises a fatal error. It is called with:
         * - $data: string, the character data found inside the tag (decoded, not escaped)
         * - $tag: string, the name of the tag
         * @param array $checkingTags The tag names whose data has to be validated. May be empty or omitted if no validation is required (the validator itself still has to be provided).
         */
        public function __construct(string $xmlfile = "", int $chunkSize = 100, string $outputFilePrefix = 'out-', ?callable $validationFunction = null, array $checkingTags = array())
        {
            // Compared against '' explicitly: empty() would also reject a file that is named "0".
            if ('' === $xmlfile) {
                trigger_error("[Chunker] Fatal error: no XML file/empty filestring specified in __construct.", E_USER_ERROR);
            }
            if (null === $validationFunction) {
                trigger_error("[Chunker] Fatal error: no callback handler specified for validation checks.", E_USER_ERROR);
            }

            $this->checkingTags = $checkingTags;
            $this->passesValidation = $validationFunction;
            $this->xmlFile = $xmlfile;
            $this->chunkSize = $chunkSize;
            $this->CHUNKS = 0;
            $this->outputFilePrefix = $outputFilePrefix;
        }

        /**
         * Writes the collected **PAYLOAD** into the next chunk file and resets the per-chunk state.
         * Does nothing (apart from logging) when the payload is empty.
         *
         * @param bool $lastChunk Tells that this is the final flush at the end of the input. Kept for
         *                        the existing call sites; the closing root tag is written for every
         *                        chunk, because the payload never contains the root element itself.
         * @return void
         */
        private function processChunk($lastChunk = false)
        {
            $this->logging("Writing new chunk..");
            if ('' == $this->PAYLOAD) {
                $this->logging("Empty PAYLOAD. Returning.");
                return;
            }

            $file = $this->outputFilePrefix . $this->CHUNKS . ".xml";
            $xp = fopen($file, "w");
            if (false === $xp) {
                trigger_error("[Chunker] Fatal error: could not open chunk file '{$file}' for writing.", E_USER_ERROR);
                // Only reached when a custom error handler swallowed the error above.
                return;
            }

            fwrite($xp, '<?xml version="1.0" encoding="' . strtolower($this->CHARSET) . '"?>' . "\n");
            fwrite($xp, '<' . $this->rootTag . ' xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">');
            fwrite($xp, $this->PAYLOAD);
            fwrite($xp, "</" . $this->rootTag . ">");
            fclose($xp);
            $this->logging("Written {$file}");

            $this->CHUNKS++;
            $this->PAYLOAD = '';
            $this->PAYLOAD_TEMP = '';
            $this->DATA_BETWEEN = '';
            $this->excludedItemFound = false;
            $this->checkNextData = false;
            $this->checkNextDataTag = '';
            $this->ITEMCOUNT = 0;
        }

        /**
         * Parser callback for opening tags. Tags outside of a main element are skipped; inside one,
         * the tag is copied (without attributes) and, if it is listed in $checkingTags, the
         * validation of its data is armed.
         *
         * @param resource|XMLParser $xml The parser
         * @param string $tag The currently parsed tag
         * @param array $attrs The attributes of the tag. Not used: attributes are not copied.
         * @return void
         */
        private function startElement($xml, $tag, $attrs = array())
        {
            if ($this->CHUNKON === $tag) {
                $this->itemDepth++;
            } elseif (0 === $this->itemDepth) {
                // Document root or another wrapper: processChunk() writes the root of every chunk.
                return;
            }

            // Strict comparison: on PHP 7 a loose in_array() matches any tag name against int 0.
            if (in_array($tag, $this->checkingTags, true)) {
                $this->checkNextData = true;
                $this->checkNextDataTag = $tag;
                // Start clean, so the validator does not see the whitespace preceding this tag.
                $this->DATA_BETWEEN = '';
            }

            $this->PAYLOAD_TEMP .= "<{$tag}>";
        }

        /**
         * Parser callback for closing tags. Runs the validator when the closed tag is the one armed
         * by startElement(). When a main element is closed, it is either dropped (some of its data
         * failed validation) or moved into the payload; once the payload holds $chunkSize main
         * elements a chunk file is written.
         *
         * @param resource|XMLParser $xml The parser
         * @param string $tag The currently parsed tag
         * @return void
         */
        private function endElement($xml, $tag)
        {
            if (0 === $this->itemDepth) {
                // Closing tag of the document root or of a wrapper that was never copied.
                return;
            }

            if ($this->checkNextData && $this->checkNextDataTag === $tag) {
                // This data has to be validated.
                if (!($this->passesValidation)($this->DATA_BETWEEN, $this->checkNextDataTag)) {
                    $this->excludedItemFound = true;
                }
                $this->checkNextData = false;
                $this->checkNextDataTag = '';
            }

            $this->PAYLOAD_TEMP .= "</{$tag}>";
            $this->DATA_BETWEEN = '';

            if ($this->CHUNKON !== $tag) {
                return;
            }
            $this->logging("Closing " . $this->CHUNKON . " element found");

            $this->itemDepth--;
            if ($this->itemDepth > 0) {
                // A nested element with the main tag's name was closed; the item is still open.
                return;
            }

            if ($this->excludedItemFound) {
                // The item contained data that did not pass validation.
                $this->logging("Excluded item found, skipping current " . $this->CHUNKON . "..");
                $this->PAYLOAD_TEMP = '';
                $this->DATA_BETWEEN = '';
                $this->excludedItemFound = false;
                $this->checkNextData = false;
                $this->checkNextDataTag = '';
                return;
            }

            $this->PAYLOAD .= $this->PAYLOAD_TEMP;
            $this->PAYLOAD_TEMP = '';
            $this->DATA_BETWEEN = '';
            $this->totalItems++;
            if (++$this->ITEMCOUNT >= $this->chunkSize) {
                $this->logging("Chunk limit reached, printing chunk...");
                $this->processChunk();
            }
        }

        /**
         * Parser callback for character data. Inside a main element the raw data is collected for a
         * possible validation and an XML-escaped copy is collected for the output. Outside of main
         * elements only whitespace is kept, which preserves the line breaks between the items.
         *
         * @param resource|XMLParser $xml The parser
         * @param string $data The data to be handled (entities already decoded by the parser)
         * @return void
         */
        private function dataHandler($xml, $data)
        {
            if (0 === $this->itemDepth) {
                if ('' === trim($data)) {
                    $this->PAYLOAD_TEMP .= $data;
                }
                return;
            }

            $this->DATA_BETWEEN .= $data;
            // The parser hands over decoded text ("&amp;" arrives as "&"), so it has to be escaped
            // again before it is written. A byte-wise replace of these ASCII characters is safe in
            // every charset the parser supports (UTF-8, ISO-8859-1, US-ASCII).
            $this->PAYLOAD_TEMP .= str_replace(
                array('&', '<', '>'),
                array('&amp;', '&lt;', '&gt;'),
                $data
            );
        }

        /**
         * Parser callback for everything that has no dedicated handler (e.g. the XML declaration,
         * comments). The content is only logged, never copied.
         *
         * @param resource|XMLParser $xml The parser
         * @param string $data The unhandled content
         * @return void
         */
        private function defaultHandler($xml, $data)
        {
            // a.k.a. Wild Text Fallback Handler, or WTFHandler for short.
            $this->logging("WTF text found: " . $data);
        }

        /**
         * Creates the XML parser instance, sets the parsing options and registers the handlers.
         *
         * @param string $CHARSET The charset that will be used by the parser. **Default: "UTF-8"**
         * @param bool $bareXML If true, an XML declaration is fed to the parser up front, for input
         *                      that does not carry one. Not used in this class.
         * @return resource|XMLParser The created parser instance
         */
        private function createXMLParser($CHARSET = "UTF-8", $bareXML = false)
        {
            $parser = xml_parser_create($CHARSET);
            // Keep tag names exactly as written; by default the parser upper-cases them.
            xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, 0);
            xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, $CHARSET);
            xml_set_element_handler($parser, [$this, 'startElement'], [$this, 'endElement']);
            xml_set_character_data_handler($parser, [$this, 'dataHandler']);
            xml_set_default_handler($parser, [$this, 'defaultHandler']);
            if ($bareXML) {
                xml_parse($parser, '<?xml version="1.0" encoding="' . $CHARSET . '"?>', false);
            }
            $this->logging("Created XML Parser");

            return $parser;
        }

        /**
         * Starts the chunking process: creates the parser, streams the XML file through it line by
         * line and writes a chunk file every time $chunkSize main elements have been collected, plus
         * a final one for the remainder.
         *
         * @param string $mainTag The tag whose elements are the items of the chunking. Usually the second-level XML tag in a document.
         * @param string $rootTag The root tag written around the main elements of every chunk file.
         * @param string $charset The character set used by the parser. **Default: UTF-8** Possible values: "UTF-8", "ISO-8859-1"
         *
         * @return string The main log that was created during the chunking
         */
        public function chunkXML($mainTag = 'shopItem', $rootTag = 'Shop', $charset = "UTF-8")
        {
            // Every chunk holds at most $chunkSize "$mainTag" elements.
            $this->rootTag = $rootTag;
            $this->CHARSET = $charset;
            $this->CHUNKON = $mainTag;

            $this->logging("Starting new Chunking.*****", true);
            $this->logging("Internal encoding: " . print_r(iconv_get_encoding(), true));
            $xml = $this->createXMLParser($this->CHARSET, false);

            $fp = fopen($this->xmlFile, 'r');
            if (!$fp) {
                $this->releaseParser($xml);
                trigger_error("Could not open XML file", E_USER_ERROR);
                // Only reached when a custom error handler swallowed the error above.
                return $this->log;
            }
            $this->logging("Opened XML File");

            $this->CHUNKS = 0;
            $this->ITEMCOUNT = 0;
            $this->totalItems = 0;
            $this->itemDepth = 0;
            $this->excludedItemFound = false;
            $this->checkNextData = false;
            $this->checkNextDataTag = '';
            $this->PAYLOAD = '';
            $this->PAYLOAD_TEMP = '';
            $this->DATA_BETWEEN = '';

            while (!feof($fp)) {
                $chunk = fgets($fp, 102400);
                $this->logging("Reading line: " . $chunk);
                if (false === $chunk) {
                    $this->logging("Reading new line failed, next try");
                    $chunk = '';
                }
                if (0 === xml_parse($xml, $chunk, feof($fp))) {
                    $this->logging("Could not parse line. Next try...");
                    $this->logging(
                        "Parser error: " . xml_error_string(xml_get_error_code($xml))
                        . " (input line " . xml_get_current_line_number($xml) . ")"
                    );
                }
            }
            fclose($fp);
            $this->releaseParser($xml);
            unset($xml);

            // Now, it is possible that one last chunk is still queued for processing.
            $this->processChunk(true);
            $this->logging("Internal encoding: " . print_r(iconv_get_encoding(), true));
            $this->logging("Ended chunking. Total processed '" . $this->CHUNKON . "' objects: " . $this->totalItems);

            return $this->log;
        }

        /**
         * Frees a parser created by createXMLParser().
         *
         * @param resource|XMLParser $parser The parser to release
         * @return void
         */
        private function releaseParser($parser)
        {
            // Since PHP 8.0 parsers are objects that are released automatically and
            // xml_parser_free() has no effect, so it is only called where it is still needed.
            if (PHP_VERSION_ID < 80000) {
                xml_parser_free($parser);
            }
        }

        /**
         * Used for administrative purposes. Appends a timestamped message to the internal log, which
         * chunkXML() returns at the end of a run.
         *
         * NOTE(review): the timestamp uses the 12-hour "h" without am/pm and lines end in "\n\r";
         * both are kept as they are, because consumers of the log may depend on the exact format.
         *
         * @param string $msg The message to be logged
         * @param bool $start If true, the log is cleared first, so $msg becomes its first entry. **Default: false**
         * @return void
         */
        public function logging($msg, $start = false)
        {
            $entry = "[" . (new DateTime())->format("y:m:d h:i:s") . "] " . $msg . "\n\r";

            if ($start) {
                $this->log = $entry;
                return;
            }
            $this->log .= $entry;
        }
    }

}
361+
362+
if (!function_exists('str_contains')) {
    /**
     * Fallback for PHP versions older than 8.0, which do not ship str_contains().
     *
     * @param string $haystack The string to search in
     * @param string $needle The substring to look for; an empty needle is always found
     * @return bool True if $needle occurs anywhere in $haystack
     */
    function str_contains(string $haystack, string $needle): bool
    {
        if ('' === $needle) {
            return true;
        }

        return strpos($haystack, $needle) !== false;
    }
}
368+
369+
?>

0 commit comments

Comments
 (0)