Skip to content

Commit d05590e

Browse files
committed
first commit
0 parents  commit d05590e

File tree

4 files changed

+971054
-0
lines changed

4 files changed

+971054
-0
lines changed

.gitignore

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
/output-0.xml
2+
/output-1.xml
3+
/output-2.xml
4+
/output-3.xml
5+
/output-4.xml
6+
/output-5.xml
7+
/output-6.xml
8+
/output-7.xml

Chunker.php

Lines changed: 254 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
1+
<?php
2+
3+
if(!class_exists("Chunker")){
4+
5+
/**
6+
* A lightweight, fast, and optimized XML file splitter with built-in tag data validation, written with the XMLParser library. Its main goal is to split an XML file into multiple small chunks (hence the name) and save each chunk as a separate small XML file, so that slower servers, plugins, etc. can process XML files that contain 10,000 or more records. It is built on XMLParser, PHP's event-based XML processing library.
7+
*
8+
* @author Borsodi Gergő
9+
* @version 1.0
10+
*/
11+
class Chunker
{
    /** Maximum number of bytes handed to the parser per read. */
    private const READ_BUFFER_BYTES = 10240;

    /**
     * Characters that have to be escaped again before text is written back to a chunk.
     * The parser hands over character data with entities already decoded, so writing it
     * back verbatim would turn "&amp;" into a bare "&" and produce invalid XML.
     */
    private const XML_TEXT_ESCAPES = ['&' => '&amp;', '<' => '&lt;', '>' => '&gt;'];

    private string $xmlFile;
    private readonly int $chunkSize;
    private int $CHUNKS;
    private string $PAYLOAD = '';
    private string $PAYLOAD_TEMP = '';
    private string $rootTag;
    private string $CHARSET;
    private string $outputFilePrefix;
    private int $ITEMCOUNT = 0;
    private string $CHUNKON;
    private string $log = "";
    private int $totalItems = 0;
    private bool $excludedItemFound = false;
    private bool $checkNextData = false;
    private string $checkNextDataTag = '';
    /** Raw (unescaped) text collected for the tag that is currently being validated. */
    private string $checkBuffer = '';
    /** Number of currently open elements; 0 means the parser is outside the document root. */
    private int $depth = 0;
    private array $checkingTags = array();
    /** @var callable|null Validator invoked as ($data, $tag) and expected to return bool. */
    private $passesValidation;

    /**
     * Creates a new Chunker.
     *
     * @param string $xmlfile The path of the XML file to split.
     * @param int $chunkSize The maximum number of main elements written into one chunk file. **Default: 100**
     * @param string $outputFilePrefix Prefix of the chunk file names. The pattern is *{outputFilePrefix}{CHUNK_NUMBER}.xml*, numbering starts at 0. **Default: 'out-'**
     * @param callable|null $validationFunction Validator called for every tag listed in $checkingTags. It receives
     *  - $data: string, the complete text content of the tag
     *  - $tag: string, the tag name
     * and has to return **bool**: true keeps the enclosing main element, false excludes it from the chunks.
     * It is required; passing null raises an E_USER_ERROR.
     * @param array $checkingTags Tag names whose content has to be validated. May be empty if no validation is needed.
     */
    public function __construct(string $xmlfile = "", int $chunkSize = 100, string $outputFilePrefix = 'out-', ?callable $validationFunction = null, array $checkingTags = array())
    {
        if (empty($xmlfile)) {
            trigger_error("[Chunker] Fatal error: no XML file/empty filestring specified in __construct.", E_USER_ERROR);
        }
        if ($validationFunction === null) {
            trigger_error("[Chunker] Fatal error: no callback handler specified for validation checks.", E_USER_ERROR);
        }
        $this->checkingTags = $checkingTags;
        $this->passesValidation = $validationFunction;
        $this->xmlFile = $xmlfile;
        $this->chunkSize = $chunkSize;
        $this->CHUNKS = 0;
        $this->outputFilePrefix = $outputFilePrefix;
    }

    /**
     * Writes the collected PAYLOAD into the next chunk file and resets the per-chunk state.
     * Every chunk is a complete document: XML declaration, root start tag, payload, root end tag.
     * Nothing is written when the PAYLOAD is empty.
     *
     * @param bool $lastChunk Indicates that this is the final flush after parsing. The closing
     *                        root tag is written for every chunk regardless of this flag.
     * @return void
     */
    private function processChunk($lastChunk = false): void
    {
        $this->logging("Writing new chunk..");
        if ('' === $this->PAYLOAD) {
            $this->logging("Empty PAYLOAD. Returning.");
            return;
        }

        $file = $this->outputFilePrefix . $this->CHUNKS . ".xml";
        $xp = fopen($file, "w");
        if ($xp === false) {
            // Without this guard the fwrite() calls below would throw a TypeError on `false`.
            $this->logging("Could not open {$file} for writing");
            trigger_error("[Chunker] Fatal error: could not open chunk file '{$file}' for writing.", E_USER_ERROR);
            return;
        }

        fwrite($xp, '<?xml version="1.0" encoding="' . $this->CHARSET . '"?>' . "\n");
        fwrite($xp, '<' . $this->rootTag . ' xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">');
        fwrite($xp, $this->PAYLOAD);
        // The source document's own root end tag is never part of the PAYLOAD,
        // so the wrapper has to be closed here for every chunk, including the last one.
        fwrite($xp, "\n</" . $this->rootTag . ">\n");
        fclose($xp);
        $this->logging("Written {$file}");

        $this->CHUNKS++;
        $this->PAYLOAD = '';
        $this->PAYLOAD_TEMP = '';
        $this->excludedItemFound = false;
        $this->checkNextData = false;
        $this->checkNextDataTag = '';
        $this->checkBuffer = '';
        $this->ITEMCOUNT = 0;
    }

    /**
     * Parser callback for start tags. Buffers the tag and, if it is listed in
     * $checkingTags, starts collecting its text content for validation.
     *
     * NOTE: attributes are intentionally not copied into the chunks (unchanged from the
     * original implementation), so attribute data of the source document is dropped.
     *
     * @param XMLParser $xml The parser
     * @param string $tag The tag that was opened
     * @param array $attrs Attributes of the tag (unused)
     */
    private function startElement($xml, $tag, $attrs = array()): void
    {
        $isDocumentRoot = ($this->depth === 0);
        $this->depth++;
        if ($isDocumentRoot) {
            // processChunk() wraps every chunk in its own root element; buffering the
            // document's root start tag would duplicate it inside the first chunk.
            return;
        }

        if (in_array($tag, $this->checkingTags, true)) {
            $this->checkNextData = true;
            $this->checkNextDataTag = $tag;
            $this->checkBuffer = '';
        }
        $this->PAYLOAD_TEMP .= "<{$tag}>";
    }

    /**
     * Parser callback for end tags. Runs the validator when a checked tag closes, and
     * when the main element closes either drops it (a validation failed) or appends it
     * to the PAYLOAD. Once $chunkSize main elements are collected, a chunk is written.
     *
     * @param XMLParser $xml The parser
     * @param string $tag The tag that was closed
     */
    private function endElement($xml, $tag): void
    {
        $this->depth--;
        if ($this->depth === 0) {
            // Closing tag of the document root: written by processChunk(), not buffered.
            return;
        }

        if ($this->checkNextData && $tag === $this->checkNextDataTag) {
            // Validate the complete text of the tag. The parser may deliver one text node in
            // several pieces, and delivers nothing at all for an empty tag.
            if (!call_user_func($this->passesValidation, $this->checkBuffer, $tag)) {
                $this->excludedItemFound = true;
            }
            $this->checkNextData = false;
            $this->checkNextDataTag = '';
            $this->checkBuffer = '';
        }

        $this->PAYLOAD_TEMP .= "</{$tag}>";
        if ($this->CHUNKON !== $tag) {
            return;
        }

        $this->logging("Closing " . $this->CHUNKON . " element found");
        if ($this->excludedItemFound) {
            // At least one checked tag of this main element failed validation.
            $this->logging("Excluded item found, skipping current " . $this->CHUNKON . "..");
            $this->PAYLOAD_TEMP = '';
            $this->excludedItemFound = false;
            $this->checkNextData = false;
            $this->checkNextDataTag = '';
            $this->checkBuffer = '';
            return;
        }

        $this->PAYLOAD .= $this->PAYLOAD_TEMP;
        $this->PAYLOAD_TEMP = '';
        $this->totalItems++;
        if (++$this->ITEMCOUNT >= $this->chunkSize) {
            $this->logging("Chunk limit reached, printing chunk...");
            $this->processChunk();
        }
    }

    /**
     * Parser callback for character data. Collects the raw text for a pending validation
     * and buffers the text, re-escaped, for the chunk output.
     *
     * @param XMLParser $xml The parser
     * @param string $data The (entity-decoded) text fragment
     */
    private function dataHandler($xml, $data): void
    {
        if ($this->checkNextData) {
            // The validator sees the decoded text, exactly as it appears to a reader.
            $this->checkBuffer .= $data;
        }
        $this->PAYLOAD_TEMP .= strtr($data, self::XML_TEXT_ESCAPES);
    }

    /**
     * Fallback handler for everything the other handlers do not receive. Intentionally empty.
     */
    private function defaultHandler($xml, $data): void
    {
    }

    /**
     * Creates and configures the XML parser instance and registers the handlers of this class.
     *
     * @param string $CHARSET The charset used by the parser. **Default: "UTF-8"**
     * @param bool $bareXML When true, an XML declaration is fed to the parser first. Not used in this class.
     * @return XMLParser The created parser instance
     */
    private function createXMLParser($CHARSET = "UTF-8", $bareXML = false)
    {
        $parser = xml_parser_create($CHARSET);
        xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, false);
        xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, $CHARSET);
        // Closures keep the private handlers callable no matter from which scope the parser invokes them.
        xml_set_element_handler($parser, $this->startElement(...), $this->endElement(...));
        xml_set_character_data_handler($parser, $this->dataHandler(...));
        xml_set_default_handler($parser, $this->defaultHandler(...));
        if ($bareXML) {
            xml_parse($parser, '<?xml version="1.0" encoding="' . $this->CHARSET . '"?>', 0);
        }
        $this->logging("Created XML Parser");
        return $parser;
    }

    /**
     * Starts the chunking: parses the XML file and writes a chunk file after every
     * $chunkSize accepted main elements, plus one final file for the remainder.
     *
     * @param string $mainTag The tag that is counted as one main element. Usually the second-level tag of the document.
     * @param string $rootTag The root tag written around the main elements in every chunk file.
     * @param string $charset The character set used by the parser and declared in the chunk files. **Default: UTF-8**
     *
     * @return string The log created during the chunking, with line breaks converted by nl2br()
     */
    public function chunkXML($mainTag = 'shopItem', $rootTag = 'Shop', $charset = "UTF-8")
    {
        $this->rootTag = $rootTag;
        $this->CHARSET = $charset;
        $this->CHUNKON = $mainTag;

        $this->logging("Starting new Chunking.*****", true);
        $xml = $this->createXMLParser($this->CHARSET, false);

        $fp = fopen($this->xmlFile, 'r');
        if ($fp === false) {
            trigger_error("Could not open XML file", E_USER_ERROR);
            // Only reached when a custom error handler swallowed the error above.
            return nl2br($this->log);
        }
        $this->logging("Opened XML File");

        $this->CHUNKS = 0;
        $this->totalItems = 0;
        $this->ITEMCOUNT = 0;
        $this->depth = 0;
        $this->excludedItemFound = false;
        $this->checkNextData = false;
        $this->checkNextDataTag = '';
        $this->checkBuffer = '';
        $this->PAYLOAD = '';
        $this->PAYLOAD_TEMP = '';

        try {
            while (!feof($fp)) {
                $buffer = fread($fp, self::READ_BUFFER_BYTES);
                if ($buffer === false) {
                    $this->logging("Reading from the XML file failed, stopping.");
                    break;
                }
                if (xml_parse($xml, $buffer, feof($fp)) === 0) {
                    // The parser cannot recover after an error, so feeding more data is pointless.
                    $this->logging(sprintf(
                        "Could not parse XML: %s (line %d). Stopping.",
                        (string) xml_error_string(xml_get_error_code($xml)),
                        xml_get_current_line_number($xml)
                    ));
                    break;
                }
            }
        } finally {
            fclose($fp);
        }
        unset($xml);

        // Main elements collected since the last full chunk are still waiting to be written.
        $this->processChunk(true);
        $this->logging("Ended chunking. Total processed '" . $this->CHUNKON . "' objects: " . $this->totalItems);
        return nl2br($this->log);
    }

    /**
     * Appends a timestamped message to the internal log that chunkXML() returns.
     *
     * @param string $msg The message to be logged
     * @param bool $start When true, the log is cleared first and starts with this message. **Default: false**
     */
    private function logging($msg, $start = false): void
    {
        $entry = "[" . (new DateTimeImmutable())->format("y:m:d h:i:s") . "] " . $msg . "\n\r";
        $this->log = $start ? $entry : $this->log . $entry;
    }
}
251+
252+
}
253+
254+
?>

index.php

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
6+
<title>XML Chunker</title>
7+
</head>
8+
<body>
9+
<?php
10+
// Report every error category and print errors (including startup errors)
// into the page, then load the Chunker class.
error_reporting(E_ALL);
ini_set('display_startup_errors', 1);
ini_set('display_errors', 1);
require_once 'Chunker.php';
14+
15+
/**
 * Validator callback for the Chunker: decides whether the content of a checked tag
 * allows the enclosing item to be kept.
 *
 * - weight_kg: the weight has to be a number greater than 0. Fractional weights such as
 *   "0.5" (or "0,5" with a decimal comma) are accepted.
 * - categoryText / product / serie: the item is rejected when the text contains any of
 *   the excluded phrases for that tag.
 * - any other tag: rejected.
 *
 * @param string $data The text content of the tag
 * @param string $tag  The name of the tag
 * @return bool true when the data is acceptable, false when the item has to be excluded
 */
function passesValidation($data, $tag): bool
{
    switch ($tag) {
        case 'weight_kg':
            // Compare as float: intval() would truncate "0.5" to 0 and reject a valid weight.
            // An empty string converts to 0.0 and is rejected as before.
            return (float) str_replace(',', '.', trim((string) $data)) > 0;

        case 'categoryText':
            $excludedItems = array("Burkolatok", "Padlóburkolatok", "Fürdőszobai csempe és burkolat", "Magasított álló kádcsapok", "Bidézuhany hideg vizes", "Bidézuhany fejek", "Orvosi karos csaptelepek", "Önzáró piszoár szelepek", "Önzáró zuhany szelepek", "Önzáró mosdó szelepek", "Önzáró WC szelepek", "Tartalék alkatrészek csaptelephez", "Szűrő rendszer", "Fürdőszobai kiegészítők", "Hidromasszázs rendszerek POLYSAN", "Világítás kádba", "Hidromasszázs kádak", "Vonalvezető épített zuhanyhoz", "Kiegészítő gyógyszertartó", "Tükrös szekrények | Materiál plast | PVC", "Tükrös szekrények | PVC", "Tükrös szekrények | PUBLIC", "Kiegészítők bútorokhoz", "Konzolok és pult tartók", "Kis bútorok WC-be | LATUS XI", "Kis bútorok WC-be | LATUS VI<", "Kis bútorok WC-be | LATUS VI ", "| Mosogatók |", "Fali kiöntők", "Öblítési módok", "Öblítéshez tápegység", "Piszoár térelválasztók", "Piszoár Kiegészítők", "WC elektronikus bidével", "WC-k bidézuhannyal és szeleppel vagy csapteleppel", "Közösségi helyiségbe", "Sarok nyílóajtós kabin", "Téglalap alakú zuhanykabin", "Íves aszimmetrikus zuhanykabin", "L- alakú fix fal nyitható résszel", "Háromoldalú zuhanykabinok", "Zuhanykabinok mély tálcával", "Kádparaván, pneumatikus működés", "Zuhanykabinok | Kiegészítők", "Ventilátorok", "Padlófűtések", "Radiátor szelepek", "Fogasok, törölközőszárítók", "Rozetták takaró idomok", "Szorítógyűrű", "Radiátor kiegészítők | Egyéb kiegészítők", "Elektromos radiátorok", "Elektromos törölköző szárítók", "Infrapanelek", "Bűzelzárók kádhoz", "| Mosógép |", "| Mosogató |", "| Bidé |", "Takaró elemek", "Festékek, tömítőanyagok, tisztítószerek, javítási kellékek", "Sarokszelepek", "Folyókák", "Kiegészítők | Szerelési kellékek", "Szerelési kellékek | Kiegészítők", "Kerti szelepek", "Rozetták és takaró elemek", "Kéziszerszámok és kiegészítők", "Nyomólapok", "Fali tartályok, rendszerek | Kiegészítők", "Fali tartályok rendszerek | Modulok | Szerelési kellékek | Kiegészítők", "Konyhai kiegészítők", "Aqualine konyhák");
            break;

        case 'product':
            $excludedItems = array("SITIA mosdótartó szekrény", "CIRASA Mosdótartó szekrény", "LARGO Mosdótartó szekrény", "SKA fiókos mosdótartó", "PUNO mosdótartó", "LUCIE Fiókos mosdótartó szekrény", "FERRO mosdótartó", "MORIAN mosdótartó", "VIERA mosdótartó", "CIMBURA mosdótartó", "ALTAIR mosdótartó", "VEGA mosdótartó szekrény", "ZOJA mosdótartó szekrény", "KERAMIA FRESH mosdótartó szekrény", "NEON mosdótartószekrény", "VEGA mosdótartó pult", "BRAND mosdótartó", "zuhanybox", );
            break;

        case 'serie':
            $excludedItems = array("WOODY desky");
            break;

        default:
            // Tags without a rule are treated as invalid.
            return false;
    }

    // The data passes only if it contains none of the excluded phrases.
    foreach ($excludedItems as $item) {
        if (str_contains($data, $item)) {
            return false;
        }
    }
    return true;
}
52+
53+
// Split the shop feed into files of at most 1000 <shopItem> elements each
// (output-0.xml, output-1.xml, ...), validating the listed tags with
// passesValidation(), and print the chunking log into the page.
$chunker = new Chunker(
    "saphoseed20230311123839361.xml",
    1000,
    "output-",
    "passesValidation",
    ["weight_kg", "categoryText", "product", "serie"]
);
echo $chunker->chunkXML("shopItem", "Shop", "UTF-8");
58+
59+
60+
?>
61+
</body>
62+
</html>

0 commit comments

Comments
 (0)