Reorganisation
This commit is contained in:
895
lexer.c
Normal file
895
lexer.c
Normal file
@@ -0,0 +1,895 @@
|
||||
/*
|
||||
* Symisc PH7: An embeddable bytecode compiler and a virtual machine for the PHP(5) programming language.
|
||||
* Copyright (C) 2011-2012, Symisc Systems http://ph7.symisc.net/
|
||||
* Version 2.1.4
|
||||
* For information on licensing,redistribution of this file,and for a DISCLAIMER OF ALL WARRANTIES
|
||||
* please contact Symisc Systems via:
|
||||
* legal@symisc.net
|
||||
* licensing@symisc.net
|
||||
* contact@symisc.net
|
||||
* or visit:
|
||||
* http://ph7.symisc.net/
|
||||
*/
|
||||
/* $SymiscID: lex.c v2.8 Ubuntu-linux 2012-07-13 01:21 stable <chm@symisc.net> $ */
|
||||
#include "ph7int.h"
|
||||
/*
|
||||
* This file implement an efficient hand-coded,thread-safe and full-reentrant
|
||||
* lexical analyzer/Tokenizer for the PH7 engine.
|
||||
*/
|
||||
/* Forward declaration */
|
||||
static sxu32 KeywordCode(const char *z, int n);
|
||||
/*
|
||||
* Tokenize a raw PHP input.
|
||||
* Get a single low-level token from the input file. Update the stream pointer so that
|
||||
* it points to the first character beyond the extracted token.
|
||||
*/
|
||||
static sxi32 TokenizePHP(SyStream *pStream,SyToken *pToken,void *pUserData,void *pCtxData)
|
||||
{
|
||||
SyString *pStr;
|
||||
sxi32 rc;
|
||||
/* Ignore leading white spaces */
|
||||
while( pStream->zText < pStream->zEnd && pStream->zText[0] < 0xc0 && SyisSpace(pStream->zText[0]) ){
|
||||
/* Advance the stream cursor */
|
||||
if( pStream->zText[0] == '\n' ){
|
||||
/* Update line counter */
|
||||
pStream->nLine++;
|
||||
}
|
||||
pStream->zText++;
|
||||
}
|
||||
if( pStream->zText >= pStream->zEnd ){
|
||||
/* End of input reached */
|
||||
return SXERR_EOF;
|
||||
}
|
||||
/* Record token starting position and line */
|
||||
pToken->nLine = pStream->nLine;
|
||||
pToken->pUserData = 0;
|
||||
pStr = &pToken->sData;
|
||||
SyStringInitFromBuf(pStr,pStream->zText,0);
|
||||
if( pStream->zText[0] >= 0xc0 || SyisAlpha(pStream->zText[0]) || pStream->zText[0] == '_' ){
|
||||
/* The following code fragment is taken verbatim from the xPP source tree.
|
||||
* xPP is a modern embeddable macro processor with advanced features useful for
|
||||
* application seeking for a production quality,ready to use macro processor.
|
||||
* xPP is a widely used library developed and maintened by Symisc Systems.
|
||||
* You can reach the xPP home page by following this link:
|
||||
* http://xpp.symisc.net/
|
||||
*/
|
||||
const unsigned char *zIn;
|
||||
sxu32 nKeyword;
|
||||
/* Isolate UTF-8 or alphanumeric stream */
|
||||
if( pStream->zText[0] < 0xc0 ){
|
||||
pStream->zText++;
|
||||
}
|
||||
for(;;){
|
||||
zIn = pStream->zText;
|
||||
if( zIn[0] >= 0xc0 ){
|
||||
zIn++;
|
||||
/* UTF-8 stream */
|
||||
while( zIn < pStream->zEnd && ((zIn[0] & 0xc0) == 0x80) ){
|
||||
zIn++;
|
||||
}
|
||||
}
|
||||
/* Skip alphanumeric stream */
|
||||
while( zIn < pStream->zEnd && zIn[0] < 0xc0 && (SyisAlphaNum(zIn[0]) || zIn[0] == '_') ){
|
||||
zIn++;
|
||||
}
|
||||
if( zIn == pStream->zText ){
|
||||
/* Not an UTF-8 or alphanumeric stream */
|
||||
break;
|
||||
}
|
||||
/* Synchronize pointers */
|
||||
pStream->zText = zIn;
|
||||
}
|
||||
/* Record token length */
|
||||
pStr->nByte = (sxu32)((const char *)pStream->zText-pStr->zString);
|
||||
nKeyword = KeywordCode(pStr->zString,(int)pStr->nByte);
|
||||
if( nKeyword != PH7_TK_ID ){
|
||||
if( nKeyword &
|
||||
(PH7_TKWRD_NEW|PH7_TKWRD_CLONE|PH7_TKWRD_AND|PH7_TKWRD_XOR|PH7_TKWRD_OR|PH7_TKWRD_INSTANCEOF) ){
|
||||
/* Alpha stream operators [i.e: new,clone,and,instanceof,eq,ne,or,xor],save the operator instance for later processing */
|
||||
pToken->pUserData = (void *)PH7_ExprExtractOperator(pStr,0);
|
||||
/* Mark as an operator */
|
||||
pToken->nType = PH7_TK_ID|PH7_TK_OP;
|
||||
}else{
|
||||
/* We are dealing with a keyword [i.e: while,foreach,class...],save the keyword ID */
|
||||
pToken->nType = PH7_TK_KEYWORD;
|
||||
pToken->pUserData = SX_INT_TO_PTR(nKeyword);
|
||||
}
|
||||
}else{
|
||||
/* A simple identifier */
|
||||
pToken->nType = PH7_TK_ID;
|
||||
}
|
||||
}else{
|
||||
sxi32 c;
|
||||
/* Non-alpha stream */
|
||||
if( pStream->zText[0] == '#' ||
|
||||
( pStream->zText[0] == '/' && &pStream->zText[1] < pStream->zEnd && pStream->zText[1] == '/') ){
|
||||
pStream->zText++;
|
||||
/* Inline comments */
|
||||
while( pStream->zText < pStream->zEnd && pStream->zText[0] != '\n' ){
|
||||
pStream->zText++;
|
||||
}
|
||||
/* Tell the upper-layer to ignore this token */
|
||||
return SXERR_CONTINUE;
|
||||
}else if( pStream->zText[0] == '/' && &pStream->zText[1] < pStream->zEnd && pStream->zText[1] == '*' ){
|
||||
pStream->zText += 2;
|
||||
/* Block comment */
|
||||
while( pStream->zText < pStream->zEnd ){
|
||||
if( pStream->zText[0] == '*' ){
|
||||
if( &pStream->zText[1] >= pStream->zEnd || pStream->zText[1] == '/' ){
|
||||
break;
|
||||
}
|
||||
}
|
||||
if( pStream->zText[0] == '\n' ){
|
||||
pStream->nLine++;
|
||||
}
|
||||
pStream->zText++;
|
||||
}
|
||||
pStream->zText += 2;
|
||||
/* Tell the upper-layer to ignore this token */
|
||||
return SXERR_CONTINUE;
|
||||
}else if( SyisDigit(pStream->zText[0]) ){
|
||||
pStream->zText++;
|
||||
/* Decimal digit stream */
|
||||
while( pStream->zText < pStream->zEnd && pStream->zText[0] < 0xc0 && SyisDigit(pStream->zText[0]) ){
|
||||
pStream->zText++;
|
||||
}
|
||||
/* Mark the token as integer until we encounter a real number */
|
||||
pToken->nType = PH7_TK_INTEGER;
|
||||
if( pStream->zText < pStream->zEnd ){
|
||||
c = pStream->zText[0];
|
||||
if( c == '.' ){
|
||||
/* Real number */
|
||||
pStream->zText++;
|
||||
while( pStream->zText < pStream->zEnd && pStream->zText[0] < 0xc0 && SyisDigit(pStream->zText[0]) ){
|
||||
pStream->zText++;
|
||||
}
|
||||
if( pStream->zText < pStream->zEnd ){
|
||||
c = pStream->zText[0];
|
||||
if( c=='e' || c=='E' ){
|
||||
pStream->zText++;
|
||||
if( pStream->zText < pStream->zEnd ){
|
||||
c = pStream->zText[0];
|
||||
if( (c =='+' || c=='-') && &pStream->zText[1] < pStream->zEnd &&
|
||||
pStream->zText[1] < 0xc0 && SyisDigit(pStream->zText[1]) ){
|
||||
pStream->zText++;
|
||||
}
|
||||
while( pStream->zText < pStream->zEnd && pStream->zText[0] < 0xc0 && SyisDigit(pStream->zText[0]) ){
|
||||
pStream->zText++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
pToken->nType = PH7_TK_REAL;
|
||||
}else if( c=='e' || c=='E' ){
|
||||
SXUNUSED(pUserData); /* Prevent compiler warning */
|
||||
SXUNUSED(pCtxData);
|
||||
pStream->zText++;
|
||||
if( pStream->zText < pStream->zEnd ){
|
||||
c = pStream->zText[0];
|
||||
if( (c =='+' || c=='-') && &pStream->zText[1] < pStream->zEnd &&
|
||||
pStream->zText[1] < 0xc0 && SyisDigit(pStream->zText[1]) ){
|
||||
pStream->zText++;
|
||||
}
|
||||
while( pStream->zText < pStream->zEnd && pStream->zText[0] < 0xc0 && SyisDigit(pStream->zText[0]) ){
|
||||
pStream->zText++;
|
||||
}
|
||||
}
|
||||
pToken->nType = PH7_TK_REAL;
|
||||
}else if( c == 'x' || c == 'X' ){
|
||||
/* Hex digit stream */
|
||||
pStream->zText++;
|
||||
while( pStream->zText < pStream->zEnd && pStream->zText[0] < 0xc0 && SyisHex(pStream->zText[0]) ){
|
||||
pStream->zText++;
|
||||
}
|
||||
}else if(c == 'b' || c == 'B' ){
|
||||
/* Binary digit stream */
|
||||
pStream->zText++;
|
||||
while( pStream->zText < pStream->zEnd && (pStream->zText[0] == '0' || pStream->zText[0] == '1') ){
|
||||
pStream->zText++;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Record token length */
|
||||
pStr->nByte = (sxu32)((const char *)pStream->zText-pStr->zString);
|
||||
return SXRET_OK;
|
||||
}
|
||||
c = pStream->zText[0];
|
||||
pStream->zText++; /* Advance the stream cursor */
|
||||
/* Assume we are dealing with an operator*/
|
||||
pToken->nType = PH7_TK_OP;
|
||||
switch(c){
|
||||
case '$': pToken->nType = PH7_TK_DOLLAR; break;
|
||||
case '{': pToken->nType = PH7_TK_OCB; break;
|
||||
case '}': pToken->nType = PH7_TK_CCB; break;
|
||||
case '(': pToken->nType = PH7_TK_LPAREN; break;
|
||||
case '[': pToken->nType |= PH7_TK_OSB; break; /* Bitwise operation here,since the square bracket token '['
|
||||
* is a potential operator [i.e: subscripting] */
|
||||
case ']': pToken->nType = PH7_TK_CSB; break;
|
||||
case ')': {
|
||||
SySet *pTokSet = pStream->pSet;
|
||||
/* Assemble type cast operators [i.e: (int),(float),(bool)...] */
|
||||
if( pTokSet->nUsed >= 2 ){
|
||||
SyToken *pTmp;
|
||||
/* Peek the last recongnized token */
|
||||
pTmp = (SyToken *)SySetPeek(pTokSet);
|
||||
if( pTmp->nType & PH7_TK_KEYWORD ){
|
||||
sxi32 nID = SX_PTR_TO_INT(pTmp->pUserData);
|
||||
if( (sxu32)nID & (PH7_TKWRD_ARRAY|PH7_TKWRD_INT|PH7_TKWRD_FLOAT|PH7_TKWRD_STRING|PH7_TKWRD_OBJECT|PH7_TKWRD_BOOL|PH7_TKWRD_UNSET) ){
|
||||
pTmp = (SyToken *)SySetAt(pTokSet,pTokSet->nUsed - 2);
|
||||
if( pTmp->nType & PH7_TK_LPAREN ){
|
||||
/* Merge the three tokens '(' 'TYPE' ')' into a single one */
|
||||
const char * zTypeCast = "(int)";
|
||||
if( nID & PH7_TKWRD_FLOAT ){
|
||||
zTypeCast = "(float)";
|
||||
}else if( nID & PH7_TKWRD_BOOL ){
|
||||
zTypeCast = "(bool)";
|
||||
}else if( nID & PH7_TKWRD_STRING ){
|
||||
zTypeCast = "(string)";
|
||||
}else if( nID & PH7_TKWRD_ARRAY ){
|
||||
zTypeCast = "(array)";
|
||||
}else if( nID & PH7_TKWRD_OBJECT ){
|
||||
zTypeCast = "(object)";
|
||||
}else if( nID & PH7_TKWRD_UNSET ){
|
||||
zTypeCast = "(unset)";
|
||||
}
|
||||
/* Reflect the change */
|
||||
pToken->nType = PH7_TK_OP;
|
||||
SyStringInitFromBuf(&pToken->sData,zTypeCast,SyStrlen(zTypeCast));
|
||||
/* Save the instance associated with the type cast operator */
|
||||
pToken->pUserData = (void *)PH7_ExprExtractOperator(&pToken->sData,0);
|
||||
/* Remove the two previous tokens */
|
||||
pTokSet->nUsed -= 2;
|
||||
return SXRET_OK;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
pToken->nType = PH7_TK_RPAREN;
|
||||
break;
|
||||
}
|
||||
case '\'':{
|
||||
/* Single quoted string */
|
||||
pStr->zString++;
|
||||
while( pStream->zText < pStream->zEnd ){
|
||||
if( pStream->zText[0] == '\'' ){
|
||||
if( pStream->zText[-1] != '\\' ){
|
||||
break;
|
||||
}else{
|
||||
const unsigned char *zPtr = &pStream->zText[-2];
|
||||
sxi32 i = 1;
|
||||
while( zPtr > pStream->zInput && zPtr[0] == '\\' ){
|
||||
zPtr--;
|
||||
i++;
|
||||
}
|
||||
if((i&1)==0){
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if( pStream->zText[0] == '\n' ){
|
||||
pStream->nLine++;
|
||||
}
|
||||
pStream->zText++;
|
||||
}
|
||||
/* Record token length and type */
|
||||
pStr->nByte = (sxu32)((const char *)pStream->zText-pStr->zString);
|
||||
pToken->nType = PH7_TK_SSTR;
|
||||
/* Jump the trailing single quote */
|
||||
pStream->zText++;
|
||||
return SXRET_OK;
|
||||
}
|
||||
case '"':{
|
||||
sxi32 iNest;
|
||||
/* Double quoted string */
|
||||
pStr->zString++;
|
||||
while( pStream->zText < pStream->zEnd ){
|
||||
if( pStream->zText[0] == '{' && &pStream->zText[1] < pStream->zEnd && pStream->zText[1] == '$'){
|
||||
iNest = 1;
|
||||
pStream->zText++;
|
||||
/* TICKET 1433-40: Hnadle braces'{}' in double quoted string where everything is allowed */
|
||||
while(pStream->zText < pStream->zEnd ){
|
||||
if( pStream->zText[0] == '{' ){
|
||||
iNest++;
|
||||
}else if (pStream->zText[0] == '}' ){
|
||||
iNest--;
|
||||
if( iNest <= 0 ){
|
||||
pStream->zText++;
|
||||
break;
|
||||
}
|
||||
}else if( pStream->zText[0] == '\n' ){
|
||||
pStream->nLine++;
|
||||
}
|
||||
pStream->zText++;
|
||||
}
|
||||
if( pStream->zText >= pStream->zEnd ){
|
||||
break;
|
||||
}
|
||||
}
|
||||
if( pStream->zText[0] == '"' ){
|
||||
if( pStream->zText[-1] != '\\' ){
|
||||
break;
|
||||
}else{
|
||||
const unsigned char *zPtr = &pStream->zText[-2];
|
||||
sxi32 i = 1;
|
||||
while( zPtr > pStream->zInput && zPtr[0] == '\\' ){
|
||||
zPtr--;
|
||||
i++;
|
||||
}
|
||||
if((i&1)==0){
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if( pStream->zText[0] == '\n' ){
|
||||
pStream->nLine++;
|
||||
}
|
||||
pStream->zText++;
|
||||
}
|
||||
/* Record token length and type */
|
||||
pStr->nByte = (sxu32)((const char *)pStream->zText-pStr->zString);
|
||||
pToken->nType = PH7_TK_DSTR;
|
||||
/* Jump the trailing quote */
|
||||
pStream->zText++;
|
||||
return SXRET_OK;
|
||||
}
|
||||
case '`':{
|
||||
/* Backtick quoted string */
|
||||
pStr->zString++;
|
||||
while( pStream->zText < pStream->zEnd ){
|
||||
if( pStream->zText[0] == '`' && pStream->zText[-1] != '\\' ){
|
||||
break;
|
||||
}
|
||||
if( pStream->zText[0] == '\n' ){
|
||||
pStream->nLine++;
|
||||
}
|
||||
pStream->zText++;
|
||||
}
|
||||
/* Record token length and type */
|
||||
pStr->nByte = (sxu32)((const char *)pStream->zText-pStr->zString);
|
||||
pToken->nType = PH7_TK_BSTR;
|
||||
/* Jump the trailing backtick */
|
||||
pStream->zText++;
|
||||
return SXRET_OK;
|
||||
}
|
||||
case '\\': pToken->nType = PH7_TK_NSSEP; break;
|
||||
case ':':
|
||||
if( pStream->zText < pStream->zEnd && pStream->zText[0] == ':' ){
|
||||
/* Current operator: '::' */
|
||||
pStream->zText++;
|
||||
}else{
|
||||
pToken->nType = PH7_TK_COLON; /* Single colon */
|
||||
}
|
||||
break;
|
||||
case ',': pToken->nType |= PH7_TK_COMMA; break; /* Comma is also an operator */
|
||||
case ';': pToken->nType = PH7_TK_SEMI; break;
|
||||
/* Handle combined operators [i.e: +=,===,!=== ...] */
|
||||
case '=':
|
||||
pToken->nType |= PH7_TK_EQUAL;
|
||||
if( pStream->zText < pStream->zEnd ){
|
||||
if( pStream->zText[0] == '=' ){
|
||||
pToken->nType &= ~PH7_TK_EQUAL;
|
||||
/* Current operator: == */
|
||||
pStream->zText++;
|
||||
if( pStream->zText < pStream->zEnd && pStream->zText[0] == '=' ){
|
||||
/* Current operator: === */
|
||||
pStream->zText++;
|
||||
}
|
||||
}else if( pStream->zText[0] == '>' ){
|
||||
/* Array operator: => */
|
||||
pToken->nType = PH7_TK_ARRAY_OP;
|
||||
pStream->zText++;
|
||||
}else{
|
||||
/* TICKET 1433-0010: Reference operator '=&' */
|
||||
const unsigned char *zCur = pStream->zText;
|
||||
sxu32 nLine = 0;
|
||||
while( zCur < pStream->zEnd && zCur[0] < 0xc0 && SyisSpace(zCur[0]) ){
|
||||
if( zCur[0] == '\n' ){
|
||||
nLine++;
|
||||
}
|
||||
zCur++;
|
||||
}
|
||||
if( zCur < pStream->zEnd && zCur[0] == '&' ){
|
||||
/* Current operator: =& */
|
||||
pToken->nType &= ~PH7_TK_EQUAL;
|
||||
SyStringInitFromBuf(pStr,"=&",sizeof("=&")-1);
|
||||
/* Update token stream */
|
||||
pStream->zText = &zCur[1];
|
||||
pStream->nLine += nLine;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case '!':
|
||||
if( pStream->zText < pStream->zEnd && pStream->zText[0] == '=' ){
|
||||
/* Current operator: != */
|
||||
pStream->zText++;
|
||||
if( pStream->zText < pStream->zEnd && pStream->zText[0] == '=' ){
|
||||
/* Current operator: !== */
|
||||
pStream->zText++;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case '&':
|
||||
pToken->nType |= PH7_TK_AMPER;
|
||||
if( pStream->zText < pStream->zEnd ){
|
||||
if( pStream->zText[0] == '&' ){
|
||||
pToken->nType &= ~PH7_TK_AMPER;
|
||||
/* Current operator: && */
|
||||
pStream->zText++;
|
||||
}else if( pStream->zText[0] == '=' ){
|
||||
pToken->nType &= ~PH7_TK_AMPER;
|
||||
/* Current operator: &= */
|
||||
pStream->zText++;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case '|':
|
||||
if( pStream->zText < pStream->zEnd ){
|
||||
if( pStream->zText[0] == '|' ){
|
||||
/* Current operator: || */
|
||||
pStream->zText++;
|
||||
}else if( pStream->zText[0] == '=' ){
|
||||
/* Current operator: |= */
|
||||
pStream->zText++;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case '+':
|
||||
if( pStream->zText < pStream->zEnd ){
|
||||
if( pStream->zText[0] == '+' ){
|
||||
/* Current operator: ++ */
|
||||
pStream->zText++;
|
||||
}else if( pStream->zText[0] == '=' ){
|
||||
/* Current operator: += */
|
||||
pStream->zText++;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case '-':
|
||||
if( pStream->zText < pStream->zEnd ){
|
||||
if( pStream->zText[0] == '-' ){
|
||||
/* Current operator: -- */
|
||||
pStream->zText++;
|
||||
}else if( pStream->zText[0] == '=' ){
|
||||
/* Current operator: -= */
|
||||
pStream->zText++;
|
||||
}else if( pStream->zText[0] == '>' ){
|
||||
/* Current operator: -> */
|
||||
pStream->zText++;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case '*':
|
||||
if( pStream->zText < pStream->zEnd && pStream->zText[0] == '=' ){
|
||||
/* Current operator: *= */
|
||||
pStream->zText++;
|
||||
}
|
||||
break;
|
||||
case '/':
|
||||
if( pStream->zText < pStream->zEnd && pStream->zText[0] == '=' ){
|
||||
/* Current operator: /= */
|
||||
pStream->zText++;
|
||||
}
|
||||
break;
|
||||
case '%':
|
||||
if( pStream->zText < pStream->zEnd && pStream->zText[0] == '=' ){
|
||||
/* Current operator: %= */
|
||||
pStream->zText++;
|
||||
}
|
||||
break;
|
||||
case '^':
|
||||
if( pStream->zText < pStream->zEnd ){
|
||||
if( pStream->zText[0] == '=' ){
|
||||
/* Current operator: ^= */
|
||||
pStream->zText++;
|
||||
}else if( pStream->zText[0] == '^' ){
|
||||
/* Current operator: ^^ */
|
||||
pStream->zText++;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case '.':
|
||||
if( pStream->zText < pStream->zEnd && pStream->zText[0] == '=' ){
|
||||
/* Current operator: .= */
|
||||
pStream->zText++;
|
||||
}
|
||||
break;
|
||||
case '<':
|
||||
if( pStream->zText < pStream->zEnd ){
|
||||
if( pStream->zText[0] == '<' ){
|
||||
/* Current operator: << */
|
||||
pStream->zText++;
|
||||
if( pStream->zText < pStream->zEnd ){
|
||||
if( pStream->zText[0] == '=' ){
|
||||
/* Current operator: <<= */
|
||||
pStream->zText++;
|
||||
}
|
||||
}
|
||||
}else if( pStream->zText[0] == '>' ){
|
||||
/* Current operator: <> */
|
||||
pStream->zText++;
|
||||
}else if( pStream->zText[0] == '=' ){
|
||||
/* Current operator: <= */
|
||||
pStream->zText++;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case '>':
|
||||
if( pStream->zText < pStream->zEnd ){
|
||||
if( pStream->zText[0] == '>' ){
|
||||
/* Current operator: >> */
|
||||
pStream->zText++;
|
||||
if( pStream->zText < pStream->zEnd && pStream->zText[0] == '=' ){
|
||||
/* Current operator: >>= */
|
||||
pStream->zText++;
|
||||
}
|
||||
}else if( pStream->zText[0] == '=' ){
|
||||
/* Current operator: >= */
|
||||
pStream->zText++;
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if( pStr->nByte <= 0 ){
|
||||
/* Record token length */
|
||||
pStr->nByte = (sxu32)((const char *)pStream->zText-pStr->zString);
|
||||
}
|
||||
if( pToken->nType & PH7_TK_OP ){
|
||||
const ph7_expr_op *pOp;
|
||||
/* Check if the extracted token is an operator */
|
||||
pOp = PH7_ExprExtractOperator(pStr,(SyToken *)SySetPeek(pStream->pSet));
|
||||
if( pOp == 0 ){
|
||||
/* Not an operator */
|
||||
pToken->nType &= ~PH7_TK_OP;
|
||||
if( pToken->nType <= 0 ){
|
||||
pToken->nType = PH7_TK_OTHER;
|
||||
}
|
||||
}else{
|
||||
/* Save the instance associated with this operator for later processing */
|
||||
pToken->pUserData = (void *)pOp;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Tell the upper-layer to save the extracted token for later processing */
|
||||
return SXRET_OK;
|
||||
}
|
||||
|
||||
static sxu32 KeywordCode(const char *z, int n){
|
||||
typedef struct {
|
||||
char *token;
|
||||
int value;
|
||||
} ph7_token;
|
||||
static ph7_token pTokenLookup[] = {
|
||||
{"extends", PH7_TKWRD_EXTENDS},
|
||||
{"endswitch", PH7_TKWRD_ENDSWITCH},
|
||||
{"switch", PH7_TKWRD_SWITCH},
|
||||
{"print", PH7_TKWRD_PRINT},
|
||||
{"int", PH7_TKWRD_INT},
|
||||
{"require_once", PH7_TKWRD_REQONCE},
|
||||
{"require", PH7_TKWRD_REQUIRE},
|
||||
{"enddeclare", PH7_TKWRD_ENDDEC},
|
||||
{"declare", PH7_TKWRD_DECLARE},
|
||||
{"return", PH7_TKWRD_RETURN},
|
||||
{"namespace", PH7_TKWRD_NAMESPACE},
|
||||
{"echo", PH7_TKWRD_ECHO},
|
||||
{"object", PH7_TKWRD_OBJECT},
|
||||
{"throw", PH7_TKWRD_THROW},
|
||||
{"bool", PH7_TKWRD_BOOL},
|
||||
{"boolean", PH7_TKWRD_BOOL},
|
||||
{"and", PH7_TKWRD_AND},
|
||||
{"default", PH7_TKWRD_DEFAULT},
|
||||
{"try", PH7_TKWRD_TRY},
|
||||
{"case", PH7_TKWRD_CASE},
|
||||
{"self", PH7_TKWRD_SELF},
|
||||
{"final", PH7_TKWRD_FINAL},
|
||||
{"list", PH7_TKWRD_LIST},
|
||||
{"static", PH7_TKWRD_STATIC},
|
||||
{"clone", PH7_TKWRD_CLONE},
|
||||
{"new", PH7_TKWRD_NEW},
|
||||
{"const", PH7_TKWRD_CONST},
|
||||
{"string", PH7_TKWRD_STRING},
|
||||
{"global", PH7_TKWRD_GLOBAL},
|
||||
{"use", PH7_TKWRD_USE},
|
||||
{"elseif", PH7_TKWRD_ELIF},
|
||||
{"elif", PH7_TKWRD_ELIF},
|
||||
{"else", PH7_TKWRD_ELSE},
|
||||
{"if", PH7_TKWRD_IF},
|
||||
{"double", PH7_TKWRD_FLOAT},
|
||||
{"float", PH7_TKWRD_FLOAT},
|
||||
{"var", PH7_TKWRD_VAR},
|
||||
{"array", PH7_TKWRD_ARRAY},
|
||||
{"die", PH7_TKWRD_DIE},
|
||||
{"abstract", PH7_TKWRD_ABSTRACT},
|
||||
{"class", PH7_TKWRD_CLASS},
|
||||
{"as", PH7_TKWRD_AS},
|
||||
{"continue", PH7_TKWRD_CONTINUE},
|
||||
{"endif", PH7_TKWRD_ENDIF},
|
||||
{"function", PH7_TKWRD_FUNCTION},
|
||||
{"endwhile", PH7_TKWRD_ENDWHILE},
|
||||
{"while", PH7_TKWRD_WHILE},
|
||||
{"eval", PH7_TKWRD_EVAL},
|
||||
{"do", PH7_TKWRD_DO},
|
||||
{"exit", PH7_TKWRD_EXIT},
|
||||
{"implements", PH7_TKWRD_IMPLEMENTS},
|
||||
{"include_once", PH7_TKWRD_INCONCE},
|
||||
{"include", PH7_TKWRD_INCLUDE},
|
||||
{"empty", PH7_TKWRD_EMPTY},
|
||||
{"instanceof", PH7_TKWRD_INSTANCEOF},
|
||||
{"interface", PH7_TKWRD_INTERFACE},
|
||||
{"integer", PH7_TKWRD_INT},
|
||||
{"endfor", PH7_TKWRD_ENDFOR},
|
||||
{"endforeach", PH7_TKWRD_END4EACH},
|
||||
{"for", PH7_TKWRD_FOR},
|
||||
{"foreach", PH7_TKWRD_FOREACH},
|
||||
{"or", PH7_TKWRD_OR},
|
||||
{"isset", PH7_TKWRD_ISSET},
|
||||
{"parent", PH7_TKWRD_PARENT},
|
||||
{"private", PH7_TKWRD_PRIVATE},
|
||||
{"protected", PH7_TKWRD_PROTECTED},
|
||||
{"public", PH7_TKWRD_PUBLIC},
|
||||
{"catch", PH7_TKWRD_CATCH},
|
||||
{"unset", PH7_TKWRD_UNSET},
|
||||
{"xor", PH7_TKWRD_XOR},
|
||||
{"break", PH7_TKWRD_BREAK}
|
||||
};
|
||||
if(n < 2) {
|
||||
return PH7_TK_ID;
|
||||
} else {
|
||||
for(ph7_token *pToken = pTokenLookup; pToken != pTokenLookup + sizeof(pTokenLookup) / sizeof(pTokenLookup[0]); pToken++) {
|
||||
if(SyMemcmp(pToken->token, z, n) == 0) {
|
||||
return pToken->value;
|
||||
}
|
||||
}
|
||||
return PH7_TK_ID;
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Tokenize a raw PHP input.
|
||||
* This is the public tokenizer called by most code generator routines.
|
||||
*/
|
||||
PH7_PRIVATE sxi32 PH7_TokenizePHP(const char *zInput,sxu32 nLen,sxu32 nLineStart,SySet *pOut)
|
||||
{
|
||||
SyLex sLexer;
|
||||
sxi32 rc;
|
||||
/* Initialize the lexer */
|
||||
rc = SyLexInit(&sLexer,&(*pOut),TokenizePHP,0);
|
||||
if( rc != SXRET_OK ){
|
||||
return rc;
|
||||
}
|
||||
sLexer.sStream.nLine = nLineStart;
|
||||
/* Tokenize input */
|
||||
rc = SyLexTokenizeInput(&sLexer,zInput,nLen,0,0,0);
|
||||
/* Release the lexer */
|
||||
SyLexRelease(&sLexer);
|
||||
/* Tokenization result */
|
||||
return rc;
|
||||
}
|
||||
/*
|
||||
* High level public tokenizer.
|
||||
* Tokenize the input into PHP tokens and raw tokens [i.e: HTML,XML,Raw text...].
|
||||
* According to the PHP language reference manual
|
||||
* When PHP parses a file, it looks for opening and closing tags, which tell PHP
|
||||
* to start and stop interpreting the code between them. Parsing in this manner allows
|
||||
* PHP to be embedded in all sorts of different documents, as everything outside of a pair
|
||||
* of opening and closing tags is ignored by the PHP parser. Most of the time you will see
|
||||
* PHP embedded in HTML documents, as in this example.
|
||||
* <?php echo 'While this is going to be parsed.'; ?>
|
||||
* <p>This will also be ignored.</p>
|
||||
* You can also use more advanced structures:
|
||||
* Example #1 Advanced escaping
|
||||
* <?php
|
||||
* if ($expression) {
|
||||
* ?>
|
||||
* <strong>This is true.</strong>
|
||||
* <?php
|
||||
* } else {
|
||||
* ?>
|
||||
* <strong>This is false.</strong>
|
||||
* <?php
|
||||
* }
|
||||
* ?>
|
||||
* This works as expected, because when PHP hits the ?> closing tags, it simply starts outputting
|
||||
* whatever it finds (except for an immediately following newline - see instruction separation ) until it hits
|
||||
* another opening tag. The example given here is contrived, of course, but for outputting large blocks of text
|
||||
* dropping out of PHP parsing mode is generally more efficient than sending all of the text through echo() or print().
|
||||
* There are four different pairs of opening and closing tags which can be used in PHP. Three of those, <?php ?>
|
||||
* <script language="php"> </script> and <? ?> are always available. The other two are short tags and ASP style
|
||||
* tags, and can be turned on and off from the php.ini configuration file. As such, while some people find short tags
|
||||
* and ASP style tags convenient, they are less portable, and generally not recommended.
|
||||
* Note:
|
||||
* Also note that if you are embedding PHP within XML or XHTML you will need to use the <?php ?> tags to remain
|
||||
* compliant with standards.
|
||||
* Example #2 PHP Opening and Closing Tags
|
||||
* 1. <?php echo 'if you want to serve XHTML or XML documents, do it like this'; ?>
|
||||
* 2. <script language="php">
|
||||
* echo 'some editors (like FrontPage) don\'t
|
||||
* like processing instructions';
|
||||
* </script>
|
||||
*
|
||||
* 3. <? echo 'this is the simplest, an SGML processing instruction'; ?>
|
||||
* <?= expression ?> This is a shortcut for "<? echo expression ?>"
|
||||
*/
|
||||
PH7_PRIVATE sxi32 PH7_TokenizeRawText(const char *zInput,sxu32 nLen,SySet *pOut)
|
||||
{
|
||||
const char *zEnd = &zInput[nLen];
|
||||
const char *zIn = zInput;
|
||||
const char *zCur,*zCurEnd;
|
||||
SyString sCtag = { 0, 0 }; /* Closing tag */
|
||||
SyToken sToken;
|
||||
SyString sDoc;
|
||||
sxu32 nLine;
|
||||
sxi32 iNest;
|
||||
sxi32 rc;
|
||||
/* Tokenize the input into PHP tokens and raw tokens */
|
||||
nLine = 1;
|
||||
zCur = zCurEnd = 0; /* Prevent compiler warning */
|
||||
sToken.pUserData = 0;
|
||||
iNest = 0;
|
||||
sDoc.nByte = 0;
|
||||
sDoc.zString = ""; /* cc warning */
|
||||
for(;;){
|
||||
if( zIn >= zEnd ){
|
||||
/* End of input reached */
|
||||
break;
|
||||
}
|
||||
sToken.nLine = nLine;
|
||||
zCur = zIn;
|
||||
zCurEnd = 0;
|
||||
while( zIn < zEnd ){
|
||||
if( zIn[0] == '<' ){
|
||||
const char *zTmp = zIn; /* End of raw input marker */
|
||||
zIn++;
|
||||
if( zIn < zEnd ){
|
||||
if( zIn[0] == '?' ){
|
||||
zIn++;
|
||||
if( (sxu32)(zEnd - zIn) >= sizeof("php")-1 && SyStrnicmp(zIn,"php",sizeof("php")-1) == 0 ){
|
||||
/* opening tag: <?php */
|
||||
zIn += sizeof("php")-1;
|
||||
}
|
||||
/* Look for the closing tag '?>' */
|
||||
SyStringInitFromBuf(&sCtag,"?>",sizeof("?>")-1);
|
||||
zCurEnd = zTmp;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}else{
|
||||
if( zIn[0] == '\n' ){
|
||||
nLine++;
|
||||
}
|
||||
zIn++;
|
||||
}
|
||||
} /* While(zIn < zEnd) */
|
||||
if( zCurEnd == 0 ){
|
||||
zCurEnd = zIn;
|
||||
}
|
||||
/* Save the raw token */
|
||||
SyStringInitFromBuf(&sToken.sData,zCur,zCurEnd - zCur);
|
||||
sToken.nType = PH7_TOKEN_RAW;
|
||||
rc = SySetPut(&(*pOut),(const void *)&sToken);
|
||||
if( rc != SXRET_OK ){
|
||||
return rc;
|
||||
}
|
||||
if( zIn >= zEnd ){
|
||||
break;
|
||||
}
|
||||
/* Ignore leading white space */
|
||||
while( zIn < zEnd && (unsigned char)zIn[0] < 0xc0 && SyisSpace(zIn[0]) ){
|
||||
if( zIn[0] == '\n' ){
|
||||
nLine++;
|
||||
}
|
||||
zIn++;
|
||||
}
|
||||
/* Delimit the PHP chunk */
|
||||
sToken.nLine = nLine;
|
||||
zCur = zIn;
|
||||
while( (sxu32)(zEnd - zIn) >= sCtag.nByte ){
|
||||
const char *zPtr;
|
||||
if( SyMemcmp(zIn,sCtag.zString,sCtag.nByte) == 0 && iNest < 1 ){
|
||||
break;
|
||||
}
|
||||
for(;;){
|
||||
if( zIn[0] != '/' || (zIn[1] != '*' && zIn[1] != '/') /* && sCtag.nByte >= 2 */ ){
|
||||
break;
|
||||
}
|
||||
zIn += 2;
|
||||
if( zIn[-1] == '/' ){
|
||||
/* Inline comment */
|
||||
while( zIn < zEnd && zIn[0] != '\n' ){
|
||||
zIn++;
|
||||
}
|
||||
if( zIn >= zEnd ){
|
||||
zIn--;
|
||||
}
|
||||
}else{
|
||||
/* Block comment */
|
||||
while( (sxu32)(zEnd-zIn) >= sizeof("*/") - 1 ){
|
||||
if( zIn[0] == '*' && zIn[1] == '/' ){
|
||||
zIn += 2;
|
||||
break;
|
||||
}
|
||||
if( zIn[0] == '\n' ){
|
||||
nLine++;
|
||||
}
|
||||
zIn++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if( zIn[0] == '\n' ){
|
||||
nLine++;
|
||||
if( iNest > 0 ){
|
||||
zIn++;
|
||||
while( zIn < zEnd && (unsigned char)zIn[0] < 0xc0 && SyisSpace(zIn[0]) && zIn[0] != '\n' ){
|
||||
zIn++;
|
||||
}
|
||||
zPtr = zIn;
|
||||
while( zIn < zEnd ){
|
||||
if( (unsigned char)zIn[0] >= 0xc0 ){
|
||||
/* UTF-8 stream */
|
||||
zIn++;
|
||||
SX_JMP_UTF8(zIn,zEnd);
|
||||
}else if( !SyisAlphaNum(zIn[0]) && zIn[0] != '_' ){
|
||||
break;
|
||||
}else{
|
||||
zIn++;
|
||||
}
|
||||
}
|
||||
if( (sxu32)(zIn - zPtr) == sDoc.nByte && SyMemcmp(sDoc.zString,zPtr,sDoc.nByte) == 0 ){
|
||||
iNest = 0;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}else if ( (sxu32)(zEnd - zIn) >= sizeof("<<<") && zIn[0] == '<' && zIn[1] == '<' && zIn[2] == '<' && iNest < 1){
|
||||
zIn += sizeof("<<<")-1;
|
||||
while( zIn < zEnd && (unsigned char)zIn[0] < 0xc0 && SyisSpace(zIn[0]) && zIn[0] != '\n' ){
|
||||
zIn++;
|
||||
}
|
||||
if( zIn[0] == '"' || zIn[0] == '\'' ){
|
||||
zIn++;
|
||||
}
|
||||
zPtr = zIn;
|
||||
while( zIn < zEnd ){
|
||||
if( (unsigned char)zIn[0] >= 0xc0 ){
|
||||
/* UTF-8 stream */
|
||||
zIn++;
|
||||
SX_JMP_UTF8(zIn,zEnd);
|
||||
}else if( !SyisAlphaNum(zIn[0]) && zIn[0] != '_' ){
|
||||
break;
|
||||
}else{
|
||||
zIn++;
|
||||
}
|
||||
}
|
||||
SyStringInitFromBuf(&sDoc,zPtr,zIn-zPtr);
|
||||
SyStringFullTrim(&sDoc);
|
||||
if( sDoc.nByte > 0 ){
|
||||
iNest++;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
zIn++;
|
||||
|
||||
if ( zIn >= zEnd )
|
||||
break;
|
||||
}
|
||||
if( (sxu32)(zEnd - zIn) < sCtag.nByte ){
|
||||
zIn = zEnd;
|
||||
}
|
||||
if( zCur < zIn ){
|
||||
/* Save the PHP chunk for later processing */
|
||||
sToken.nType = PH7_TOKEN_PHP;
|
||||
SyStringInitFromBuf(&sToken.sData,zCur,zIn-zCur);
|
||||
SyStringRightTrim(&sToken.sData); /* Trim trailing white spaces */
|
||||
rc = SySetPut(&(*pOut),(const void *)&sToken);
|
||||
if( rc != SXRET_OK ){
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
if( zIn < zEnd ){
|
||||
/* Jump the trailing closing tag */
|
||||
zIn += sCtag.nByte;
|
||||
}
|
||||
} /* For(;;) */
|
||||
|
||||
return SXRET_OK;
|
||||
}
|
||||
Reference in New Issue
Block a user