Aer Interpreter Source
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1021 lines
30 KiB

  1. #include "lib.h"
  2. /* Tokenize an entire XML input */
  3. static sxi32 XML_Tokenize(SyStream *pStream, SyToken *pToken, void *pUserData, void *pUnused2) {
  4. SyXMLParser *pParse = (SyXMLParser *)pUserData;
  5. SyString *pStr;
  6. sxi32 rc;
  7. int c;
  8. /* Jump leading white spaces */
  9. while(pStream->zText < pStream->zEnd && pStream->zText[0] < 0xc0 && SyisSpace(pStream->zText[0])) {
  10. /* Advance the stream cursor */
  11. if(pStream->zText[0] == '\n') {
  12. /* Increment line counter */
  13. pStream->nLine++;
  14. }
  15. pStream->zText++;
  16. }
  17. if(pStream->zText >= pStream->zEnd) {
  18. SXUNUSED(pUnused2);
  19. /* End of input reached */
  20. return SXERR_EOF;
  21. }
  22. /* Record token starting position and line */
  23. pToken->nLine = pStream->nLine;
  24. pToken->pUserData = 0;
  25. pStr = &pToken->sData;
  26. SyStringInitFromBuf(pStr, pStream->zText, 0);
  27. /* Extract the current token */
  28. c = pStream->zText[0];
  29. if(c == '<') {
  30. pStream->zText++;
  31. pStr->zString++;
  32. if(pStream->zText >= pStream->zEnd) {
  33. if(pParse->xError) {
  34. rc = pParse->xError("Illegal syntax,expecting valid start name character", SXML_ERROR_SYNTAX, pToken, pParse->pUserData);
  35. if(rc == SXERR_ABORT) {
  36. return SXERR_ABORT;
  37. }
  38. }
  39. /* End of input reached */
  40. return SXERR_EOF;
  41. }
  42. c = pStream->zText[0];
  43. if(c == '?') {
  44. /* Processing instruction */
  45. pStream->zText++;
  46. pStr->zString++;
  47. pToken->nType = SXML_TOK_PI;
  48. while(XLEX_IN_LEN(pStream) >= sizeof("?>") - 1 &&
  49. SyMemcmp((const void *)pStream->zText, "?>", sizeof("?>") - 1) != 0) {
  50. if(pStream->zText[0] == '\n') {
  51. /* Increment line counter */
  52. pStream->nLine++;
  53. }
  54. pStream->zText++;
  55. }
  56. /* Record token length */
  57. pStr->nByte = (sxu32)((const char *)pStream->zText - pStr->zString);
  58. if(XLEX_IN_LEN(pStream) < sizeof("?>") - 1) {
  59. if(pParse->xError) {
  60. rc = pParse->xError("End of input found,but processing instruction was not found", SXML_ERROR_UNCLOSED_TOKEN, pToken, pParse->pUserData);
  61. if(rc == SXERR_ABORT) {
  62. return SXERR_ABORT;
  63. }
  64. }
  65. return SXERR_EOF;
  66. }
  67. pStream->zText += sizeof("?>") - 1;
  68. } else if(c == '!') {
  69. pStream->zText++;
  70. if(XLEX_IN_LEN(pStream) >= sizeof("--") - 1 && pStream->zText[0] == '-' && pStream->zText[1] == '-') {
  71. /* Comment */
  72. pStream->zText += sizeof("--") - 1;
  73. while(XLEX_IN_LEN(pStream) >= sizeof("-->") - 1 &&
  74. SyMemcmp((const void *)pStream->zText, "-->", sizeof("-->") - 1) != 0) {
  75. if(pStream->zText[0] == '\n') {
  76. /* Increment line counter */
  77. pStream->nLine++;
  78. }
  79. pStream->zText++;
  80. }
  81. pStream->zText += sizeof("-->") - 1;
  82. /* Tell the lexer to ignore this token */
  83. return SXERR_CONTINUE;
  84. }
  85. if(XLEX_IN_LEN(pStream) >= sizeof("[CDATA[") - 1 && SyMemcmp((const void *)pStream->zText, "[CDATA[", sizeof("[CDATA[") - 1) == 0) {
  86. /* CDATA */
  87. pStream->zText += sizeof("[CDATA[") - 1;
  88. pStr->zString = (const char *)pStream->zText;
  89. while(XLEX_IN_LEN(pStream) >= sizeof("]]>") - 1 &&
  90. SyMemcmp((const void *)pStream->zText, "]]>", sizeof("]]>") - 1) != 0) {
  91. if(pStream->zText[0] == '\n') {
  92. /* Increment line counter */
  93. pStream->nLine++;
  94. }
  95. pStream->zText++;
  96. }
  97. /* Record token type and length */
  98. pStr->nByte = (sxu32)((const char *)pStream->zText - pStr->zString);
  99. pToken->nType = SXML_TOK_CDATA;
  100. if(XLEX_IN_LEN(pStream) < sizeof("]]>") - 1) {
  101. if(pParse->xError) {
  102. rc = pParse->xError("End of input found,but ]]> was not found", SXML_ERROR_UNCLOSED_TOKEN, pToken, pParse->pUserData);
  103. if(rc == SXERR_ABORT) {
  104. return SXERR_ABORT;
  105. }
  106. }
  107. return SXERR_EOF;
  108. }
  109. pStream->zText += sizeof("]]>") - 1;
  110. return SXRET_OK;
  111. }
  112. if(XLEX_IN_LEN(pStream) >= sizeof("DOCTYPE") - 1 && SyMemcmp((const void *)pStream->zText, "DOCTYPE", sizeof("DOCTYPE") - 1) == 0) {
  113. SyString sDelim = { ">", sizeof(char) }; /* Default delimiter */
  114. int c = 0;
  115. /* DOCTYPE */
  116. pStream->zText += sizeof("DOCTYPE") - 1;
  117. pStr->zString = (const char *)pStream->zText;
  118. /* Check for element declaration */
  119. while(pStream->zText < pStream->zEnd && pStream->zText[0] != '\n') {
  120. if(pStream->zText[0] >= 0xc0 || !SyisSpace(pStream->zText[0])) {
  121. c = pStream->zText[0];
  122. if(c == '>') {
  123. break;
  124. }
  125. }
  126. pStream->zText++;
  127. }
  128. if(c == '[') {
  129. /* Change the delimiter */
  130. SyStringInitFromBuf(&sDelim, "]>", sizeof("]>") - 1);
  131. }
  132. if(c != '>') {
  133. while(XLEX_IN_LEN(pStream) >= sDelim.nByte &&
  134. SyMemcmp((const void *)pStream->zText, sDelim.zString, sDelim.nByte) != 0) {
  135. if(pStream->zText[0] == '\n') {
  136. /* Increment line counter */
  137. pStream->nLine++;
  138. }
  139. pStream->zText++;
  140. }
  141. }
  142. /* Record token type and length */
  143. pStr->nByte = (sxu32)((const char *)pStream->zText - pStr->zString);
  144. pToken->nType = SXML_TOK_DOCTYPE;
  145. if(XLEX_IN_LEN(pStream) < sDelim.nByte) {
  146. if(pParse->xError) {
  147. rc = pParse->xError("End of input found,but ]> or > was not found", SXML_ERROR_UNCLOSED_TOKEN, pToken, pParse->pUserData);
  148. if(rc == SXERR_ABORT) {
  149. return SXERR_ABORT;
  150. }
  151. }
  152. return SXERR_EOF;
  153. }
  154. pStream->zText += sDelim.nByte;
  155. return SXRET_OK;
  156. }
  157. } else {
  158. int c;
  159. c = pStream->zText[0];
  160. rc = SXRET_OK;
  161. pToken->nType = SXML_TOK_START_TAG;
  162. if(c == '/') {
  163. /* End tag */
  164. pToken->nType = SXML_TOK_END_TAG;
  165. pStream->zText++;
  166. pStr->zString++;
  167. if(pStream->zText >= pStream->zEnd) {
  168. if(pParse->xError) {
  169. rc = pParse->xError("Illegal syntax,expecting valid start name character", SXML_ERROR_SYNTAX, pToken, pParse->pUserData);
  170. if(rc == SXERR_ABORT) {
  171. return SXERR_ABORT;
  172. }
  173. }
  174. return SXERR_EOF;
  175. }
  176. c = pStream->zText[0];
  177. }
  178. if(c == '>') {
  179. /*<>*/
  180. if(pParse->xError) {
  181. rc = pParse->xError("Illegal syntax,expecting valid start name character", SXML_ERROR_SYNTAX, pToken, pParse->pUserData);
  182. if(rc == SXERR_ABORT) {
  183. return SXERR_ABORT;
  184. }
  185. }
  186. /* Ignore the token */
  187. return SXERR_CONTINUE;
  188. }
  189. if(c < 0xc0 && (SyisSpace(c) || SyisDigit(c) || c == '.' || c == '-' || IS_XML_DIRTY(c))) {
  190. if(pParse->xError) {
  191. rc = pParse->xError("Illegal syntax,expecting valid start name character", SXML_ERROR_SYNTAX, pToken, pParse->pUserData);
  192. if(rc == SXERR_ABORT) {
  193. return SXERR_ABORT;
  194. }
  195. }
  196. rc = SXERR_INVALID;
  197. }
  198. pStream->zText++;
  199. /* Delimit the tag */
  200. while(pStream->zText < pStream->zEnd && pStream->zText[0] != '>') {
  201. c = pStream->zText[0];
  202. if(c >= 0xc0) {
  203. /* UTF-8 stream */
  204. pStream->zText++;
  205. SX_JMP_UTF8(pStream->zText, pStream->zEnd);
  206. } else {
  207. if(c == '/' && &pStream->zText[1] < pStream->zEnd && pStream->zText[1] == '>') {
  208. pStream->zText++;
  209. if(pToken->nType != SXML_TOK_START_TAG) {
  210. if(pParse->xError) {
  211. rc = pParse->xError("Unexpected closing tag,expecting '>'",
  212. SXML_ERROR_SYNTAX, pToken, pParse->pUserData);
  213. if(rc == SXERR_ABORT) {
  214. return SXERR_ABORT;
  215. }
  216. }
  217. /* Ignore the token */
  218. rc = SXERR_INVALID;
  219. } else {
  220. pToken->nType = SXML_TOK_START_END;
  221. }
  222. break;
  223. }
  224. if(pStream->zText[0] == '\n') {
  225. /* Increment line counter */
  226. pStream->nLine++;
  227. }
  228. /* Advance the stream cursor */
  229. pStream->zText++;
  230. }
  231. }
  232. if(rc != SXRET_OK) {
  233. /* Tell the lexer to ignore this token */
  234. return SXERR_CONTINUE;
  235. }
  236. /* Record token length */
  237. pStr->nByte = (sxu32)((const char *)pStream->zText - pStr->zString);
  238. if(pToken->nType == SXML_TOK_START_END && pStr->nByte > 0) {
  239. pStr->nByte -= sizeof(char);
  240. }
  241. if(pStream->zText < pStream->zEnd) {
  242. pStream->zText++;
  243. } else {
  244. if(pParse->xError) {
  245. rc = pParse->xError("End of input found,but closing tag '>' was not found", SXML_ERROR_UNCLOSED_TOKEN, pToken, pParse->pUserData);
  246. if(rc == SXERR_ABORT) {
  247. return SXERR_ABORT;
  248. }
  249. }
  250. }
  251. }
  252. } else {
  253. /* Raw input */
  254. while(pStream->zText < pStream->zEnd) {
  255. c = pStream->zText[0];
  256. if(c < 0xc0) {
  257. if(c == '<') {
  258. break;
  259. } else if(c == '\n') {
  260. /* Increment line counter */
  261. pStream->nLine++;
  262. }
  263. /* Advance the stream cursor */
  264. pStream->zText++;
  265. } else {
  266. /* UTF-8 stream */
  267. pStream->zText++;
  268. SX_JMP_UTF8(pStream->zText, pStream->zEnd);
  269. }
  270. }
  271. /* Record token type,length */
  272. pToken->nType = SXML_TOK_RAW;
  273. pStr->nByte = (sxu32)((const char *)pStream->zText - pStr->zString);
  274. }
  275. /* Return to the lexer */
  276. return SXRET_OK;
  277. }
  278. static int XMLCheckDuplicateAttr(SyXMLRawStr *aSet, sxu32 nEntry, SyXMLRawStr *pEntry) {
  279. sxu32 n;
  280. for(n = 0 ; n < nEntry ; n += 2) {
  281. SyXMLRawStr *pAttr = &aSet[n];
  282. if(pAttr->nByte == pEntry->nByte && SyMemcmp(pAttr->zString, pEntry->zString, pEntry->nByte) == 0) {
  283. /* Attribute found */
  284. return 1;
  285. }
  286. }
  287. /* No duplicates */
  288. return 0;
  289. }
  290. static sxi32 XMLProcessNamesSpace(SyXMLParser *pParse, SyXMLRawStrNS *pTag, SyToken *pToken, SySet *pAttr) {
  291. SyXMLRawStr *pPrefix, *pUri; /* Namespace prefix/URI */
  292. SyHashEntry *pEntry;
  293. SyXMLRawStr *pDup;
  294. sxi32 rc;
  295. /* Extract the URI first */
  296. pUri = (SyXMLRawStr *)SySetPeek(pAttr);
  297. /* Extract the prefix */
  298. pPrefix = (SyXMLRawStr *)SySetAt(pAttr, SySetUsed(pAttr) - 2);
  299. /* Prefix name */
  300. if(pPrefix->nByte == sizeof("xmlns") - 1) {
  301. /* Default namespace */
  302. pPrefix->nByte = 0;
  303. pPrefix->zString = ""; /* Empty string */
  304. } else {
  305. pPrefix->nByte -= sizeof("xmlns") - 1;
  306. pPrefix->zString += sizeof("xmlns") - 1;
  307. if(pPrefix->zString[0] != ':') {
  308. return SXRET_OK;
  309. }
  310. pPrefix->nByte--;
  311. pPrefix->zString++;
  312. if(pPrefix->nByte < 1) {
  313. if(pParse->xError) {
  314. rc = pParse->xError("Invalid namespace name", SXML_ERROR_SYNTAX, pToken, pParse->pUserData);
  315. if(rc == SXERR_ABORT) {
  316. return SXERR_ABORT;
  317. }
  318. }
  319. /* POP the last insertred two entries */
  320. (void)SySetPop(pAttr);
  321. (void)SySetPop(pAttr);
  322. return SXERR_SYNTAX;
  323. }
  324. }
  325. /* Invoke the namespace callback if available */
  326. if(pParse->xNameSpace) {
  327. rc = pParse->xNameSpace(pPrefix, pUri, pParse->pUserData);
  328. if(rc == SXERR_ABORT) {
  329. /* User callback request an operation abort */
  330. return SXERR_ABORT;
  331. }
  332. }
  333. /* Duplicate structure */
  334. pDup = (SyXMLRawStr *)SyMemBackendAlloc(pParse->pAllocator, sizeof(SyXMLRawStr));
  335. if(pDup == 0) {
  336. if(pParse->xError) {
  337. pParse->xError("Out of memory", SXML_ERROR_NO_MEMORY, pToken, pParse->pUserData);
  338. }
  339. /* Abort processing immediately */
  340. return SXERR_ABORT;
  341. }
  342. *pDup = *pUri; /* Structure assignment */
  343. /* Save the namespace */
  344. if(pPrefix->nByte == 0) {
  345. pPrefix->zString = "Default";
  346. pPrefix->nByte = sizeof("Default") - 1;
  347. }
  348. SyHashInsert(&pParse->hns, (const void *)pPrefix->zString, pPrefix->nByte, pDup);
  349. /* Peek the last inserted entry */
  350. pEntry = SyHashLastEntry(&pParse->hns);
  351. /* Store in the corresponding tag container*/
  352. SySetPut(&pTag->sNSset, (const void *)&pEntry);
  353. /* POP the last insertred two entries */
  354. (void)SySetPop(pAttr);
  355. (void)SySetPop(pAttr);
  356. return SXRET_OK;
  357. }
  358. static sxi32 XMLProcessStartTag(SyXMLParser *pParse, SyToken *pToken, SyXMLRawStrNS *pTag, SySet *pAttrSet, SySet *pTagStack) {
  359. SyString *pIn = &pToken->sData;
  360. const char *zIn, *zCur, *zEnd;
  361. SyXMLRawStr sEntry;
  362. sxi32 rc;
  363. int c;
  364. /* Reset the working set */
  365. SySetReset(pAttrSet);
  366. /* Delimit the raw tag */
  367. zIn = pIn->zString;
  368. zEnd = &zIn[pIn->nByte];
  369. while(zIn < zEnd && (unsigned char)zIn[0] < 0xc0 && SyisSpace(zIn[0])) {
  370. zIn++;
  371. }
  372. /* Isolate tag name */
  373. sEntry.nLine = pTag->nLine = pToken->nLine;
  374. zCur = zIn;
  375. while(zIn < zEnd) {
  376. if((unsigned char)zIn[0] >= 0xc0) {
  377. /* UTF-8 stream */
  378. zIn++;
  379. SX_JMP_UTF8(zIn, zEnd);
  380. } else if(SyisSpace(zIn[0])) {
  381. break;
  382. } else {
  383. if(IS_XML_DIRTY(zIn[0])) {
  384. if(pParse->xError) {
  385. rc = pParse->xError("Illegal character in XML name", SXML_ERROR_SYNTAX, pToken, pParse->pUserData);
  386. if(rc == SXERR_ABORT) {
  387. return SXERR_ABORT;
  388. }
  389. }
  390. }
  391. zIn++;
  392. }
  393. }
  394. if(zCur >= zIn) {
  395. if(pParse->xError) {
  396. rc = pParse->xError("Invalid XML name", SXML_ERROR_SYNTAX, pToken, pParse->pUserData);
  397. if(rc == SXERR_ABORT) {
  398. return SXERR_ABORT;
  399. }
  400. }
  401. return SXERR_SYNTAX;
  402. }
  403. pTag->zString = zCur;
  404. pTag->nByte = (sxu32)(zIn - zCur);
  405. /* Process tag attribute */
  406. for(;;) {
  407. int is_ns = 0;
  408. while(zIn < zEnd && (unsigned char)zIn[0] < 0xc0 && SyisSpace(zIn[0])) {
  409. zIn++;
  410. }
  411. if(zIn >= zEnd) {
  412. break;
  413. }
  414. zCur = zIn;
  415. while(zIn < zEnd && zIn[0] != '=') {
  416. if((unsigned char)zIn[0] >= 0xc0) {
  417. /* UTF-8 stream */
  418. zIn++;
  419. SX_JMP_UTF8(zIn, zEnd);
  420. } else if(SyisSpace(zIn[0])) {
  421. break;
  422. } else {
  423. zIn++;
  424. }
  425. }
  426. if(zCur >= zIn) {
  427. if(pParse->xError) {
  428. rc = pParse->xError("Missing attribute name", SXML_ERROR_SYNTAX, pToken, pParse->pUserData);
  429. if(rc == SXERR_ABORT) {
  430. return SXERR_ABORT;
  431. }
  432. }
  433. return SXERR_SYNTAX;
  434. }
  435. /* Store attribute name */
  436. sEntry.zString = zCur;
  437. sEntry.nByte = (sxu32)(zIn - zCur);
  438. if((pParse->nFlags & SXML_ENABLE_NAMESPACE) && sEntry.nByte >= sizeof("xmlns") - 1 &&
  439. SyMemcmp(sEntry.zString, "xmlns", sizeof("xmlns") - 1) == 0) {
  440. is_ns = 1;
  441. }
  442. while(zIn < zEnd && (unsigned char)zIn[0] < 0xc0 && SyisSpace(zIn[0])) {
  443. zIn++;
  444. }
  445. if(zIn >= zEnd || zIn[0] != '=') {
  446. if(pParse->xError) {
  447. rc = pParse->xError("Missing attribute value", SXML_ERROR_SYNTAX, pToken, pParse->pUserData);
  448. if(rc == SXERR_ABORT) {
  449. return SXERR_ABORT;
  450. }
  451. }
  452. return SXERR_SYNTAX;
  453. }
  454. while(sEntry.nByte > 0 && (unsigned char)zCur[sEntry.nByte - 1] < 0xc0
  455. && SyisSpace(zCur[sEntry.nByte - 1])) {
  456. sEntry.nByte--;
  457. }
  458. /* Check for duplicates first */
  459. if(XMLCheckDuplicateAttr((SyXMLRawStr *)SySetBasePtr(pAttrSet), SySetUsed(pAttrSet), &sEntry)) {
  460. if(pParse->xError) {
  461. rc = pParse->xError("Duplicate attribute", SXML_ERROR_DUPLICATE_ATTRIBUTE, pToken, pParse->pUserData);
  462. if(rc == SXERR_ABORT) {
  463. return SXERR_ABORT;
  464. }
  465. }
  466. return SXERR_SYNTAX;
  467. }
  468. if(SXRET_OK != SySetPut(pAttrSet, (const void *)&sEntry)) {
  469. return SXERR_ABORT;
  470. }
  471. /* Extract attribute value */
  472. zIn++; /* Jump the trailing '=' */
  473. while(zIn < zEnd && (unsigned char)zIn[0] < 0xc0 && SyisSpace(zIn[0])) {
  474. zIn++;
  475. }
  476. if(zIn >= zEnd) {
  477. if(pParse->xError) {
  478. rc = pParse->xError("Missing attribute value", SXML_ERROR_SYNTAX, pToken, pParse->pUserData);
  479. if(rc == SXERR_ABORT) {
  480. return SXERR_ABORT;
  481. }
  482. }
  483. (void)SySetPop(pAttrSet);
  484. return SXERR_SYNTAX;
  485. }
  486. if(zIn[0] != '\'' && zIn[0] != '"') {
  487. if(pParse->xError) {
  488. rc = pParse->xError("Missing quotes on attribute value", SXML_ERROR_SYNTAX, pToken, pParse->pUserData);
  489. if(rc == SXERR_ABORT) {
  490. return SXERR_ABORT;
  491. }
  492. }
  493. (void)SySetPop(pAttrSet);
  494. return SXERR_SYNTAX;
  495. }
  496. c = zIn[0];
  497. zIn++;
  498. zCur = zIn;
  499. while(zIn < zEnd && zIn[0] != c) {
  500. zIn++;
  501. }
  502. if(zIn >= zEnd) {
  503. if(pParse->xError) {
  504. rc = pParse->xError("Missing quotes on attribute value", SXML_ERROR_SYNTAX, pToken, pParse->pUserData);
  505. if(rc == SXERR_ABORT) {
  506. return SXERR_ABORT;
  507. }
  508. }
  509. (void)SySetPop(pAttrSet);
  510. return SXERR_SYNTAX;
  511. }
  512. /* Store attribute value */
  513. sEntry.zString = zCur;
  514. sEntry.nByte = (sxu32)(zIn - zCur);
  515. if(SXRET_OK != SySetPut(pAttrSet, (const void *)&sEntry)) {
  516. return SXERR_ABORT;
  517. }
  518. zIn++;
  519. if(is_ns) {
  520. /* Process namespace declaration */
  521. XMLProcessNamesSpace(pParse, pTag, pToken, pAttrSet);
  522. }
  523. }
  524. /* Store in the tag stack */
  525. if(pToken->nType == SXML_TOK_START_TAG) {
  526. rc = SySetPut(pTagStack, (const void *)pTag);
  527. }
  528. return SXRET_OK;
  529. }
  530. static void XMLExtactPI(SyToken *pToken, SyXMLRawStr *pTarget, SyXMLRawStr *pData, int *pXML) {
  531. SyString *pIn = &pToken->sData;
  532. const char *zIn, *zCur, *zEnd;
  533. pTarget->nLine = pData->nLine = pToken->nLine;
  534. /* Nullify the entries first */
  535. pTarget->zString = pData->zString = 0;
  536. /* Ignore leading and trailing white spaces */
  537. SyStringFullTrim(pIn);
  538. /* Delimit the raw PI */
  539. zIn = pIn->zString;
  540. zEnd = &zIn[pIn->nByte];
  541. if(pXML) {
  542. *pXML = 0;
  543. }
  544. /* Extract the target */
  545. zCur = zIn;
  546. while(zIn < zEnd) {
  547. if((unsigned char)zIn[0] >= 0xc0) {
  548. /* UTF-8 stream */
  549. zIn++;
  550. SX_JMP_UTF8(zIn, zEnd);
  551. } else if(SyisSpace(zIn[0])) {
  552. break;
  553. } else {
  554. zIn++;
  555. }
  556. }
  557. if(zIn > zCur) {
  558. pTarget->zString = zCur;
  559. pTarget->nByte = (sxu32)(zIn - zCur);
  560. if(pXML && pTarget->nByte == sizeof("xml") - 1 && SyStrnicmp(pTarget->zString, "xml", sizeof("xml") - 1) == 0) {
  561. *pXML = 1;
  562. }
  563. }
  564. /* Extract the PI data */
  565. while(zIn < zEnd && (unsigned char)zIn[0] < 0xc0 && SyisSpace(zIn[0])) {
  566. zIn++;
  567. }
  568. if(zIn < zEnd) {
  569. pData->zString = zIn;
  570. pData->nByte = (sxu32)(zEnd - zIn);
  571. }
  572. }
  573. static sxi32 XMLExtractEndTag(SyXMLParser *pParse, SyToken *pToken, SyXMLRawStrNS *pOut) {
  574. SyString *pIn = &pToken->sData;
  575. const char *zEnd = &pIn->zString[pIn->nByte];
  576. const char *zIn = pIn->zString;
  577. /* Ignore leading white spaces */
  578. while(zIn < zEnd && (unsigned char)zIn[0] < 0xc0 && SyisSpace(zIn[0])) {
  579. zIn++;
  580. }
  581. pOut->nLine = pToken->nLine;
  582. pOut->zString = zIn;
  583. pOut->nByte = (sxu32)(zEnd - zIn);
  584. /* Ignore trailing white spaces */
  585. while(pOut->nByte > 0 && (unsigned char)pOut->zString[pOut->nByte - 1] < 0xc0
  586. && SyisSpace(pOut->zString[pOut->nByte - 1])) {
  587. pOut->nByte--;
  588. }
  589. if(pOut->nByte < 1) {
  590. if(pParse->xError) {
  591. sxi32 rc;
  592. rc = pParse->xError("Invalid end tag name", SXML_ERROR_INVALID_TOKEN, pToken, pParse->pUserData);
  593. if(rc == SXERR_ABORT) {
  594. return SXERR_ABORT;
  595. }
  596. }
  597. return SXERR_SYNTAX;
  598. }
  599. return SXRET_OK;
  600. }
  601. static void TokenToXMLString(SyToken *pTok, SyXMLRawStrNS *pOut) {
  602. /* Remove leading and trailing white spaces first */
  603. SyStringFullTrim(&pTok->sData);
  604. pOut->zString = SyStringData(&pTok->sData);
  605. pOut->nByte = SyStringLength(&pTok->sData);
  606. }
  607. static sxi32 XMLExtractNS(SyXMLParser *pParse, SyToken *pToken, SyXMLRawStrNS *pTag, SyXMLRawStr *pnsUri) {
  608. SyXMLRawStr *pUri, sPrefix;
  609. SyHashEntry *pEntry;
  610. sxu32 nOfft;
  611. sxi32 rc;
  612. /* Extract a prefix if available */
  613. rc = SyByteFind(pTag->zString, pTag->nByte, ':', &nOfft);
  614. if(rc != SXRET_OK) {
  615. /* Check if there is a default namespace */
  616. pEntry = SyHashGet(&pParse->hns, "Default", sizeof("Default") - 1);
  617. if(pEntry) {
  618. /* Extract the ns URI */
  619. pUri = (SyXMLRawStr *)pEntry->pUserData;
  620. /* Save the ns URI */
  621. pnsUri->zString = pUri->zString;
  622. pnsUri->nByte = pUri->nByte;
  623. }
  624. return SXRET_OK;
  625. }
  626. if(nOfft < 1) {
  627. if(pParse->xError) {
  628. rc = pParse->xError("Empty prefix is not allowed according to XML namespace specification",
  629. SXML_ERROR_SYNTAX, pToken, pParse->pUserData);
  630. if(rc == SXERR_ABORT) {
  631. return SXERR_ABORT;
  632. }
  633. }
  634. return SXERR_SYNTAX;
  635. }
  636. sPrefix.zString = pTag->zString;
  637. sPrefix.nByte = nOfft;
  638. sPrefix.nLine = pTag->nLine;
  639. pTag->zString += nOfft + 1;
  640. pTag->nByte -= nOfft;
  641. if(pTag->nByte < 1) {
  642. if(pParse->xError) {
  643. rc = pParse->xError("Missing tag name", SXML_ERROR_SYNTAX, pToken, pParse->pUserData);
  644. if(rc == SXERR_ABORT) {
  645. return SXERR_ABORT;
  646. }
  647. }
  648. return SXERR_SYNTAX;
  649. }
  650. /* Check if the prefix is already registered */
  651. pEntry = SyHashGet(&pParse->hns, sPrefix.zString, sPrefix.nByte);
  652. if(pEntry == 0) {
  653. if(pParse->xError) {
  654. rc = pParse->xError("Namespace prefix is not defined", SXML_ERROR_SYNTAX,
  655. pToken, pParse->pUserData);
  656. if(rc == SXERR_ABORT) {
  657. return SXERR_ABORT;
  658. }
  659. }
  660. return SXERR_SYNTAX;
  661. }
  662. /* Extract the ns URI */
  663. pUri = (SyXMLRawStr *)pEntry->pUserData;
  664. /* Save the ns URI */
  665. pnsUri->zString = pUri->zString;
  666. pnsUri->nByte = pUri->nByte;
  667. /* All done */
  668. return SXRET_OK;
  669. }
  670. static sxi32 XMLnsUnlink(SyXMLParser *pParse, SyXMLRawStrNS *pLast, SyToken *pToken) {
  671. SyHashEntry **apEntry, *pEntry;
  672. void *pUserData;
  673. sxu32 n;
  674. /* Release namespace entries */
  675. apEntry = (SyHashEntry **)SySetBasePtr(&pLast->sNSset);
  676. for(n = 0 ; n < SySetUsed(&pLast->sNSset) ; ++n) {
  677. pEntry = apEntry[n];
  678. /* Invoke the end namespace declaration callback */
  679. if(pParse->xNameSpaceEnd && (pParse->nFlags & SXML_ENABLE_NAMESPACE) && pToken) {
  680. SyXMLRawStr sPrefix;
  681. sxi32 rc;
  682. sPrefix.zString = (const char *)pEntry->pKey;
  683. sPrefix.nByte = pEntry->nKeyLen;
  684. sPrefix.nLine = pToken->nLine;
  685. rc = pParse->xNameSpaceEnd(&sPrefix, pParse->pUserData);
  686. if(rc == SXERR_ABORT) {
  687. return SXERR_ABORT;
  688. }
  689. }
  690. pUserData = pEntry->pUserData;
  691. /* Remove from the namespace hashtable */
  692. SyHashDeleteEntry2(pEntry);
  693. SyMemBackendFree(pParse->pAllocator, pUserData);
  694. }
  695. SySetRelease(&pLast->sNSset);
  696. return SXRET_OK;
  697. }
  698. /* Process XML tokens */
  699. static sxi32 ProcessXML(SyXMLParser *pParse, SySet *pTagStack, SySet *pWorker) {
  700. SySet *pTokenSet = &pParse->sToken;
  701. SyXMLRawStrNS sEntry;
  702. SyXMLRawStr sNs;
  703. SyToken *pToken;
  704. int bGotTag;
  705. sxi32 rc;
  706. /* Initialize fields */
  707. bGotTag = 0;
  708. /* Start processing */
  709. if(pParse->xStartDoc && (SXERR_ABORT == pParse->xStartDoc(pParse->pUserData))) {
  710. /* User callback request an operation abort */
  711. return SXERR_ABORT;
  712. }
  713. /* Reset the loop cursor */
  714. SySetResetCursor(pTokenSet);
  715. /* Extract the current token */
  716. while(SXRET_OK == (SySetGetNextEntry(&(*pTokenSet), (void **)&pToken))) {
  717. SyZero(&sEntry, sizeof(SyXMLRawStrNS));
  718. SyZero(&sNs, sizeof(SyXMLRawStr));
  719. SySetInit(&sEntry.sNSset, pParse->pAllocator, sizeof(SyHashEntry *));
  720. sEntry.nLine = sNs.nLine = pToken->nLine;
  721. switch(pToken->nType) {
  722. case SXML_TOK_DOCTYPE:
  723. if(SySetUsed(pTagStack) > 1 || bGotTag) {
  724. if(pParse->xError) {
  725. rc = pParse->xError("DOCTYPE must be declared first", SXML_ERROR_MISPLACED_XML_PI, pToken, pParse->pUserData);
  726. if(rc == SXERR_ABORT) {
  727. return SXERR_ABORT;
  728. }
  729. }
  730. break;
  731. }
  732. /* Invoke the supplied callback if any */
  733. if(pParse->xDoctype) {
  734. TokenToXMLString(pToken, &sEntry);
  735. rc = pParse->xDoctype((SyXMLRawStr *)&sEntry, pParse->pUserData);
  736. if(rc == SXERR_ABORT) {
  737. return SXERR_ABORT;
  738. }
  739. }
  740. break;
  741. case SXML_TOK_CDATA:
  742. if(SySetUsed(pTagStack) < 1) {
  743. if(pParse->xError) {
  744. rc = pParse->xError("CDATA without matching tag", SXML_ERROR_TAG_MISMATCH, pToken, pParse->pUserData);
  745. if(rc == SXERR_ABORT) {
  746. return SXERR_ABORT;
  747. }
  748. }
  749. }
  750. /* Invoke the supplied callback if any */
  751. if(pParse->xRaw) {
  752. TokenToXMLString(pToken, &sEntry);
  753. rc = pParse->xRaw((SyXMLRawStr *)&sEntry, pParse->pUserData);
  754. if(rc == SXERR_ABORT) {
  755. return SXERR_ABORT;
  756. }
  757. }
  758. break;
  759. case SXML_TOK_PI: {
  760. SyXMLRawStr sTarget, sData;
  761. int isXML = 0;
  762. /* Extract the target and data */
  763. XMLExtactPI(pToken, &sTarget, &sData, &isXML);
  764. if(isXML && SySetCursor(pTokenSet) - 1 > 0) {
  765. if(pParse->xError) {
  766. rc = pParse->xError("Unexpected XML declaration. The XML declaration must be the first node in the document",
  767. SXML_ERROR_MISPLACED_XML_PI, pToken, pParse->pUserData);
  768. if(rc == SXERR_ABORT) {
  769. return SXERR_ABORT;
  770. }
  771. }
  772. } else if(pParse->xPi) {
  773. /* Invoke the supplied callback*/
  774. rc = pParse->xPi(&sTarget, &sData, pParse->pUserData);
  775. if(rc == SXERR_ABORT) {
  776. return SXERR_ABORT;
  777. }
  778. }
  779. break;
  780. }
  781. case SXML_TOK_RAW:
  782. if(SySetUsed(pTagStack) < 1) {
  783. if(pParse->xError) {
  784. rc = pParse->xError("Text (Raw data) without matching tag", SXML_ERROR_TAG_MISMATCH, pToken, pParse->pUserData);
  785. if(rc == SXERR_ABORT) {
  786. return SXERR_ABORT;
  787. }
  788. }
  789. break;
  790. }
  791. /* Invoke the supplied callback if any */
  792. if(pParse->xRaw) {
  793. TokenToXMLString(pToken, &sEntry);
  794. rc = pParse->xRaw((SyXMLRawStr *)&sEntry, pParse->pUserData);
  795. if(rc == SXERR_ABORT) {
  796. return SXERR_ABORT;
  797. }
  798. }
  799. break;
  800. case SXML_TOK_END_TAG: {
  801. SyXMLRawStrNS *pLast = 0; /* cc warning */
  802. if(SySetUsed(pTagStack) < 1) {
  803. if(pParse->xError) {
  804. rc = pParse->xError("Unexpected closing tag", SXML_ERROR_TAG_MISMATCH, pToken, pParse->pUserData);
  805. if(rc == SXERR_ABORT) {
  806. return SXERR_ABORT;
  807. }
  808. }
  809. break;
  810. }
  811. rc = XMLExtractEndTag(pParse, pToken, &sEntry);
  812. if(rc == SXRET_OK) {
  813. /* Extract the last inserted entry */
  814. pLast = (SyXMLRawStrNS *)SySetPeek(pTagStack);
  815. if(pLast == 0 || pLast->nByte != sEntry.nByte ||
  816. SyMemcmp(pLast->zString, sEntry.zString, sEntry.nByte) != 0) {
  817. if(pParse->xError) {
  818. rc = pParse->xError("Unexpected closing tag", SXML_ERROR_TAG_MISMATCH, pToken, pParse->pUserData);
  819. if(rc == SXERR_ABORT) {
  820. return SXERR_ABORT;
  821. }
  822. }
  823. } else {
  824. /* Invoke the supplied callback if any */
  825. if(pParse->xEndTag) {
  826. rc = SXRET_OK;
  827. if(pParse->nFlags & SXML_ENABLE_NAMESPACE) {
  828. /* Extract namespace URI */
  829. rc = XMLExtractNS(pParse, pToken, &sEntry, &sNs);
  830. if(rc == SXERR_ABORT) {
  831. return SXERR_ABORT;
  832. }
  833. }
  834. if(rc == SXRET_OK) {
  835. rc = pParse->xEndTag((SyXMLRawStr *)&sEntry, &sNs, pParse->pUserData);
  836. if(rc == SXERR_ABORT) {
  837. return SXERR_ABORT;
  838. }
  839. }
  840. }
  841. }
  842. } else if(rc == SXERR_ABORT) {
  843. return SXERR_ABORT;
  844. }
  845. if(pLast) {
  846. rc = XMLnsUnlink(pParse, pLast, pToken);
  847. (void)SySetPop(pTagStack);
  848. if(rc == SXERR_ABORT) {
  849. return SXERR_ABORT;
  850. }
  851. }
  852. break;
  853. }
  854. case SXML_TOK_START_TAG:
  855. case SXML_TOK_START_END:
  856. if(SySetUsed(pTagStack) < 1 && bGotTag) {
  857. if(pParse->xError) {
  858. rc = pParse->xError("XML document cannot contain multiple root level elements documents",
  859. SXML_ERROR_SYNTAX, pToken, pParse->pUserData);
  860. if(rc == SXERR_ABORT) {
  861. return SXERR_ABORT;
  862. }
  863. }
  864. break;
  865. }
  866. bGotTag = 1;
  867. /* Extract the tag and it's supplied attribute */
  868. rc = XMLProcessStartTag(pParse, pToken, &sEntry, pWorker, pTagStack);
  869. if(rc == SXRET_OK) {
  870. if(pParse->nFlags & SXML_ENABLE_NAMESPACE) {
  871. /* Extract namespace URI */
  872. rc = XMLExtractNS(pParse, pToken, &sEntry, &sNs);
  873. }
  874. }
  875. if(rc == SXRET_OK) {
  876. /* Invoke the supplied callback */
  877. if(pParse->xStartTag) {
  878. rc = pParse->xStartTag((SyXMLRawStr *)&sEntry, &sNs, SySetUsed(pWorker),
  879. (SyXMLRawStr *)SySetBasePtr(pWorker), pParse->pUserData);
  880. if(rc == SXERR_ABORT) {
  881. return SXERR_ABORT;
  882. }
  883. }
  884. if(pToken->nType == SXML_TOK_START_END) {
  885. if(pParse->xEndTag) {
  886. rc = pParse->xEndTag((SyXMLRawStr *)&sEntry, &sNs, pParse->pUserData);
  887. if(rc == SXERR_ABORT) {
  888. return SXERR_ABORT;
  889. }
  890. }
  891. rc = XMLnsUnlink(pParse, &sEntry, pToken);
  892. if(rc == SXERR_ABORT) {
  893. return SXERR_ABORT;
  894. }
  895. }
  896. } else if(rc == SXERR_ABORT) {
  897. /* Abort processing immediately */
  898. return SXERR_ABORT;
  899. }
  900. break;
  901. default:
  902. /* Can't happen */
  903. break;
  904. }
  905. }
  906. if(SySetUsed(pTagStack) > 0 && pParse->xError) {
  907. pParse->xError("Missing closing tag", SXML_ERROR_SYNTAX,
  908. (SyToken *)SySetPeek(&pParse->sToken), pParse->pUserData);
  909. }
  910. if(pParse->xEndDoc) {
  911. pParse->xEndDoc(pParse->pUserData);
  912. }
  913. return SXRET_OK;
  914. }
  915. PH7_PRIVATE sxi32 SyXMLParserInit(SyXMLParser *pParser, SyMemBackend *pAllocator, sxi32 iFlags) {
  916. /* Zero the structure first */
  917. SyZero(pParser, sizeof(SyXMLParser));
  918. /* Initialize fields */
  919. SySetInit(&pParser->sToken, pAllocator, sizeof(SyToken));
  920. SyLexInit(&pParser->sLex, &pParser->sToken, XML_Tokenize, pParser);
  921. SyHashInit(&pParser->hns, pAllocator, 0, 0);
  922. pParser->pAllocator = pAllocator;
  923. pParser->nFlags = iFlags;
  924. return SXRET_OK;
  925. }
  926. PH7_PRIVATE sxi32 SyXMLParserSetEventHandler(SyXMLParser *pParser,
  927. void *pUserData,
  928. ProcXMLStartTagHandler xStartTag,
  929. ProcXMLTextHandler xRaw,
  930. ProcXMLSyntaxErrorHandler xErr,
  931. ProcXMLStartDocument xStartDoc,
  932. ProcXMLEndTagHandler xEndTag,
  933. ProcXMLPIHandler xPi,
  934. ProcXMLEndDocument xEndDoc,
  935. ProcXMLDoctypeHandler xDoctype,
  936. ProcXMLNameSpaceStart xNameSpace,
  937. ProcXMLNameSpaceEnd xNameSpaceEnd
  938. ) {
  939. /* Install user callbacks */
  940. if(xErr) {
  941. pParser->xError = xErr;
  942. }
  943. if(xStartDoc) {
  944. pParser->xStartDoc = xStartDoc;
  945. }
  946. if(xStartTag) {
  947. pParser->xStartTag = xStartTag;
  948. }
  949. if(xRaw) {
  950. pParser->xRaw = xRaw;
  951. }
  952. if(xEndTag) {
  953. pParser->xEndTag = xEndTag;
  954. }
  955. if(xPi) {
  956. pParser->xPi = xPi;
  957. }
  958. if(xEndDoc) {
  959. pParser->xEndDoc = xEndDoc;
  960. }
  961. if(xDoctype) {
  962. pParser->xDoctype = xDoctype;
  963. }
  964. if(xNameSpace) {
  965. pParser->xNameSpace = xNameSpace;
  966. }
  967. if(xNameSpaceEnd) {
  968. pParser->xNameSpaceEnd = xNameSpaceEnd;
  969. }
  970. pParser->pUserData = pUserData;
  971. return SXRET_OK;
  972. }
  973. /* Process an XML chunk */
  974. PH7_PRIVATE sxi32 SyXMLProcess(SyXMLParser *pParser, const char *zInput, sxu32 nByte) {
  975. SySet sTagStack;
  976. SySet sWorker;
  977. sxi32 rc;
  978. /* Initialize working sets */
  979. SySetInit(&sWorker, pParser->pAllocator, sizeof(SyXMLRawStr)); /* Tag container */
  980. SySetInit(&sTagStack, pParser->pAllocator, sizeof(SyXMLRawStrNS)); /* Tag stack */
  981. /* Tokenize the entire input */
  982. rc = SyLexTokenizeInput(&pParser->sLex, zInput, nByte, 0, 0, 0);
  983. if(rc == SXERR_ABORT) {
  984. /* Tokenize callback request an operation abort */
  985. return SXERR_ABORT;
  986. }
  987. if(SySetUsed(&pParser->sToken) < 1) {
  988. /* Nothing to process [i.e: white spaces] */
  989. rc = SXRET_OK;
  990. } else {
  991. /* Process XML Tokens */
  992. rc = ProcessXML(&(*pParser), &sTagStack, &sWorker);
  993. if(pParser->nFlags & SXML_ENABLE_NAMESPACE) {
  994. if(SySetUsed(&sTagStack) > 0) {
  995. SyXMLRawStrNS *pEntry;
  996. SyHashEntry **apEntry;
  997. sxu32 n;
  998. SySetResetCursor(&sTagStack);
  999. while(SySetGetNextEntry(&sTagStack, (void **)&pEntry) == SXRET_OK) {
  1000. /* Release namespace entries */
  1001. apEntry = (SyHashEntry **)SySetBasePtr(&pEntry->sNSset);
  1002. for(n = 0 ; n < SySetUsed(&pEntry->sNSset) ; ++n) {
  1003. SyMemBackendFree(pParser->pAllocator, apEntry[n]->pUserData);
  1004. }
  1005. SySetRelease(&pEntry->sNSset);
  1006. }
  1007. }
  1008. }
  1009. }
  1010. /* Clean-up the mess left behind */
  1011. SySetRelease(&sWorker);
  1012. SySetRelease(&sTagStack);
  1013. /* Processing result */
  1014. return rc;
  1015. }
  1016. PH7_PRIVATE sxi32 SyXMLParserRelease(SyXMLParser *pParser) {
  1017. SyLexRelease(&pParser->sLex);
  1018. SySetRelease(&pParser->sToken);
  1019. SyHashRelease(&pParser->hns);
  1020. return SXRET_OK;
  1021. }