You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
880 lines
27 KiB
880 lines
27 KiB
#include <fstream> |
|
#include <iostream> |
|
#include <vector> |
|
#include <map> |
|
#include <stack> |
|
#include <string.h> |
|
#include "Parser.h" |
|
#include "Object.h" |
|
#include "Exception.h" |
|
#include "Utils.h" |
|
|
|
using namespace merge_lib; |
|
using namespace std; |
|
|
|
// Character classes shared by the tokenizer helpers in this file.
// They are defined in this order because the last one is built from the
// first two.

// Characters skipped between tokens.
const std::string Parser::WHITESPACES(" \t\f\v\n\r");

// Characters that terminate a token in addition to whitespace.
const std::string Parser::DELIMETERS("()<>{}/%][");

// Decimal digits.
const std::string Parser::NUMBERS("0123456789");

// Concatenation of WHITESPACES and DELIMETERS.
const std::string Parser::WHITESPACES_AND_DELIMETERS(Parser::WHITESPACES + Parser::DELIMETERS);
|
|
|
// Parses the file `fileName` and returns a newly allocated Document.
//
// The object tree is built first, then the document (pages, max object
// number) is derived from it. If either step throws a std::exception the
// parser state is cleared, the partially built Document is deleted and the
// exception is re-thrown unchanged.
Document * Parser::parseDocument(const char * fileName)
{
   _document = new Document(fileName);

   try
   {
      _createObjectTree(fileName);
      _createDocument(fileName);
   }
   catch(std::exception &)
   {
      // Roll back: nothing built so far may stay reachable through the parser.
      _clearParser();
      delete _document;
      _document = NULL;
      throw;
   }

   return _document;
}
|
|
|
// Walks the page tree rooted at `objectWithKids` and registers every leaf
// page in _document->_pages, numbering pages from 1 in visiting order.
//
// An object whose content has no "/Kids" entry but contains "/Page" is treated
// as a leaf page; otherwise the children referenced between "/Kids" and the
// following "]" are visited recursively.
//
// The search results are kept in std::string::size_type: storing them in
// `unsigned int` truncates npos on 64-bit targets, which made the
// "no /Kids" test always false there.
void Parser::_retrieveAllPages(Object * objectWithKids)
{
   const std::string & objectContent = objectWithKids->getObjectContent();
   const std::string::size_type startOfKids = objectContent.find("/Kids");

   const bool isLeafPage =
      (startOfKids == std::string::npos) &&
      (objectContent.find("/Page") != std::string::npos);
   if(isLeafPage)
   {
      const unsigned int numberOfPages = _document->_pages.size() + 1;
      Page * newPage = new Page(numberOfPages);
      newPage->_root = objectWithKids;
      _document->_pages.insert(std::pair<unsigned int, Page *>(numberOfPages, newPage));
      return;
   }

   const std::string::size_type endOfKids = objectContent.find("]", startOfKids);
   const std::vector<Object *> & kids = objectWithKids->getSortedByPositionChildren(startOfKids, endOfKids);
   for(size_t i = 0; i < kids.size(); ++i)
   {
      _retrieveAllPages(kids[i]);
   }
}
|
|
|
void Parser::_createDocument(const char * docName) |
|
{ |
|
_document->_root = _root; |
|
Object * objectWithPages = 0; |
|
std::string & rootContent = _root->getObjectContent(); |
|
unsigned int startOfPages = rootContent.find("/Pages"); |
|
if(startOfPages == std::string::npos) |
|
throw Exception("Some document is wrong"); |
|
unsigned int endOfPages = rootContent.find("R", startOfPages); |
|
std::vector<Object *> objectWithKids = _root->getChildrenByBounds(startOfPages, endOfPages); |
|
if(objectWithKids.size() != 1) |
|
throw Exception("Some document is wrong"); |
|
_retrieveAllPages(objectWithKids[0]); |
|
|
|
_root->retrieveMaxObjectNumber(_document->_maxObjectNumber); |
|
_clearParser(); |
|
} |
|
|
|
void Parser::_clearParser() |
|
{ |
|
_root = 0; |
|
_fileContent.clear(); |
|
_fileContent.reserve(); |
|
_objects.clear(); |
|
} |
|
|
|
|
|
void Parser::_getFileContent(const char * fileName) |
|
{ |
|
ifstream pdfFile; |
|
pdfFile.open (fileName, ios::binary ); |
|
if (pdfFile.fail()) |
|
{ |
|
stringstream errorMessage("File "); |
|
errorMessage << fileName << " is absent" << "\0"; |
|
throw Exception(errorMessage); |
|
} |
|
// get length of file: |
|
pdfFile.seekg (0, ios::end); |
|
int length = pdfFile.tellg(); |
|
pdfFile.seekg (0, ios::beg); |
|
_fileContent.resize(length); |
|
pdfFile.read(&_fileContent[0], length); |
|
|
|
// check version |
|
const char *header = "%PDF-1."; |
|
size_t verPos = _fileContent.find(header); |
|
if( verPos == 0 ) |
|
{ |
|
verPos += strlen(header); |
|
char ver = _fileContent[verPos]; |
|
if( ver < '0' || ver > '4' ) |
|
{ |
|
stringstream errorMsg; |
|
errorMsg<<" File with verion 1."<<ver<<" is not currently supported by merge library\n"; |
|
throw Exception(errorMsg); |
|
} |
|
} |
|
else |
|
{ |
|
throw Exception("Unrecognized header of PDF file"); |
|
} |
|
pdfFile.close(); |
|
} |
|
|
|
|
|
void Parser::_createObjectTree(const char * fileName) |
|
{ |
|
unsigned int rootObjectNumber = 0; |
|
try |
|
{ |
|
_getFileContent(fileName); |
|
_readXRefAndCreateObjects(); |
|
rootObjectNumber = _readTrailerAndReturnRoot(); |
|
} |
|
catch (std::exception &) |
|
{ |
|
std::map<unsigned int, Object *>::const_iterator it(_objects.begin()); |
|
for(;it != _objects.end();it++) |
|
{ |
|
delete (*it).second; |
|
} |
|
_objects.clear(); |
|
throw; |
|
} |
|
|
|
std::map<unsigned int, Object *>::iterator objectsIterator; |
|
|
|
for ( objectsIterator = _objects.begin() ; objectsIterator != _objects.end(); objectsIterator++ ) |
|
{ |
|
Object * currentObject = (*objectsIterator).second; |
|
_document->_allObjects.push_back(currentObject); |
|
//key - object number : value - positions in object content of this reference |
|
const std::map<unsigned int, Object::ReferencePositionsInContent> & refs = |
|
_getReferences(currentObject->getObjectContent()); |
|
std::map<unsigned int, Object::ReferencePositionsInContent>::const_iterator refsIterator = refs.begin(); |
|
for(; refsIterator != refs.end(); ++refsIterator) |
|
{ |
|
if(_objects.count((*refsIterator).first)) |
|
currentObject->addChild(_objects[(*refsIterator).first], (*refsIterator).second); |
|
} |
|
} |
|
_root = _objects[rootObjectNumber]; |
|
|
|
} |
|
|
|
const std::map<unsigned int, Object::ReferencePositionsInContent> & Parser::_getReferences(const std::string & objectContent) |
|
{ |
|
unsigned int currentPosition(0), startOfNextSearch(0); |
|
static std::map<unsigned int, std::vector<unsigned int> > searchResult; |
|
searchResult.clear(); |
|
unsigned int streamStart = objectContent.find("stream"); |
|
if(streamStart == string::npos) |
|
streamStart = objectContent.size(); |
|
while(startOfNextSearch < streamStart) |
|
{ |
|
//try to find reference. reference example is 15 0 R |
|
startOfNextSearch = objectContent.find(" R", startOfNextSearch); |
|
currentPosition = startOfNextSearch; |
|
if(currentPosition != std::string::npos) |
|
{ |
|
//check that next character of " R" is WHITESPACE. |
|
|
|
if((WHITESPACES.find(objectContent[currentPosition + 2]) == string::npos) && |
|
(DELIMETERS.find(objectContent[currentPosition + 2]) == string::npos) |
|
) |
|
{ |
|
//this is not reference. this is something looks like "0 0 0 RG" |
|
++startOfNextSearch; |
|
continue; |
|
} |
|
//get previos symbol and check that it is a number |
|
unsigned int numberSearchCounter = _skipNumber(objectContent, --currentPosition); |
|
|
|
//previos symbol is not a number |
|
if(numberSearchCounter == currentPosition) |
|
{ |
|
++startOfNextSearch; |
|
continue; |
|
} |
|
else |
|
{ |
|
currentPosition = numberSearchCounter; |
|
} |
|
|
|
bool isFound(false); |
|
//previos symbols should be whitespaces |
|
while((objectContent[currentPosition] == ' ') && --currentPosition) |
|
{ |
|
isFound = true; |
|
} |
|
|
|
//previos symbol is not a whitespace |
|
if(!isFound) |
|
{ |
|
++startOfNextSearch; |
|
continue; |
|
} |
|
//check that this and may be previos symbols are a numbers |
|
numberSearchCounter = _skipNumber(objectContent, currentPosition); |
|
if(numberSearchCounter == currentPosition) |
|
{ |
|
++startOfNextSearch; |
|
continue; |
|
} |
|
unsigned int objectNumber = Utils::stringToInt(objectContent.substr(numberSearchCounter + 1, currentPosition - numberSearchCounter)); |
|
|
|
searchResult[objectNumber].push_back(numberSearchCounter + 1); |
|
|
|
|
|
++startOfNextSearch; |
|
|
|
} |
|
else |
|
break; |
|
} |
|
return searchResult; |
|
} |
|
|
|
unsigned int Parser::_skipNumber(const std::string & str, unsigned int currentPosition) |
|
{ |
|
unsigned int numberSearchCounter = currentPosition; |
|
while((NUMBERS.find(str[numberSearchCounter]) != string::npos) && --numberSearchCounter) |
|
{} |
|
|
|
return numberSearchCounter; |
|
} |
|
void Parser::_readXRefAndCreateObjects() |
|
{ |
|
unsigned int currentPostion = _getStartOfXrefWithRoot(); |
|
do |
|
{ |
|
const std::string & currentToken = _getNextToken(currentPostion); |
|
if(currentToken != "xref") |
|
{ |
|
throw Exception("Wrong xref in some document"); |
|
} |
|
unsigned int endOfLine = _getEndOfLineFromContent(currentPostion ); |
|
if(_countTokens(currentPostion, endOfLine) != 2) |
|
{ |
|
throw Exception("Wrong xref in some document"); |
|
|
|
} |
|
//now we are reading the xref |
|
while(1) |
|
{ |
|
unsigned int firstObjectNumber = Utils::stringToInt(_getNextToken(currentPostion)); |
|
unsigned int objectCount = Utils::stringToInt(_getNextToken(currentPostion)); |
|
for(unsigned int i(0); i < objectCount; i++) |
|
{ |
|
unsigned long first; |
|
unsigned long second; |
|
|
|
if(_countTokens(currentPostion, _getEndOfLineFromContent(currentPostion)) == 3) |
|
{ |
|
first = Utils::stringToInt(_getNextToken(currentPostion)); |
|
second = Utils::stringToInt(_getNextToken(currentPostion)); |
|
const string & use = _getNextToken(currentPostion); |
|
if(!use.compare("n")) |
|
{ |
|
unsigned int objectNumber; |
|
|
|
try |
|
{ |
|
std::pair<unsigned int, unsigned int> streamBounds; |
|
bool hasObjectStream; |
|
unsigned int generationNumber; |
|
const std::string content = _getObjectContent(first, objectNumber, generationNumber, streamBounds, hasObjectStream); |
|
if(!_objects.count(objectNumber)) |
|
{ |
|
Object * newObject = new Object(objectNumber, generationNumber, content, _document->_documentName ,streamBounds, hasObjectStream); |
|
_objects[objectNumber] = newObject; |
|
} |
|
} |
|
catch(std::exception &) |
|
{ |
|
} |
|
|
|
} |
|
} |
|
else |
|
{ |
|
; |
|
} |
|
++currentPostion; |
|
|
|
|
|
} |
|
unsigned int previosPostion = currentPostion; |
|
const std::string & isTrailer = _getNextToken(currentPostion); |
|
|
|
std::string trailer("trailer"); |
|
if(isTrailer == trailer) |
|
{ |
|
currentPostion -= trailer.size(); |
|
break; |
|
} |
|
else |
|
currentPostion = previosPostion; |
|
|
|
} |
|
} |
|
while(_readTrailerAndRterievePrev(currentPostion, currentPostion)); |
|
|
|
|
|
} |
|
|
|
unsigned int Parser::_getStartOfXrefWithRoot() |
|
{ |
|
unsigned int leftBoundOfStartOfXref = _fileContent.rfind("startxref"); |
|
leftBoundOfStartOfXref = _fileContent.find_first_of(NUMBERS, leftBoundOfStartOfXref); |
|
|
|
unsigned int rightBoundOfStartOfXref = _fileContent.find_first_not_of(NUMBERS, leftBoundOfStartOfXref + 1); |
|
|
|
std::string startOfXref = _fileContent.substr(leftBoundOfStartOfXref, rightBoundOfStartOfXref - leftBoundOfStartOfXref); |
|
int integerStartOfXref = Utils::stringToInt(startOfXref); |
|
return integerStartOfXref; |
|
} |
|
|
|
unsigned int Parser::_getEndOfLineFromContent(unsigned int fromPosition) |
|
{ |
|
fromPosition = _skipWhiteSpacesFromContent(fromPosition); |
|
unsigned int endOfLine = _fileContent.find_first_of("\n\r", fromPosition); |
|
endOfLine = _fileContent.find_last_of("\n\r", endOfLine); |
|
return endOfLine; |
|
|
|
} |
|
|
|
// Returns the bounds of the line of `str` that contains `fromPosition`:
// first  - position of the closest '\n' at or before fromPosition, or 0 when
//          there is none;
// second - position of the closest '\n' at or after fromPosition, or
//          str.size() when there is none.
//
// The pair is a function-local static, so the returned reference is only
// valid until the next call. The lookups are compared with npos in
// size_type; comparing the `unsigned int` pair members never matched on
// 64-bit targets.
const std::pair<unsigned int, unsigned int> & Parser::_getLineBounds(const std::string & str, unsigned int fromPosition)
{
   static std::pair<unsigned int, unsigned int> bounds;

   const std::string::size_type lineStart = str.rfind('\n', fromPosition);
   bounds.first = (lineStart == std::string::npos)
      ? 0u
      : static_cast<unsigned int>(lineStart);

   const std::string::size_type lineEnd = str.find('\n', fromPosition);
   bounds.second = static_cast<unsigned int>(
      (lineEnd == std::string::npos) ? str.size() : lineEnd);

   return bounds;
}
|
|
|
const std::string & Parser::_getNextToken(unsigned int & fromPosition) |
|
{ |
|
fromPosition = _skipWhiteSpacesFromContent(fromPosition); |
|
unsigned int position = _fileContent.find_first_of(WHITESPACES, fromPosition); |
|
|
|
static std::string token; |
|
if(position > fromPosition) |
|
{ |
|
unsigned int tokenSize = position - fromPosition; |
|
token.resize(tokenSize); |
|
memcpy(&token[0], &_fileContent[fromPosition], tokenSize); |
|
fromPosition = position; |
|
return token; |
|
} |
|
else |
|
{ |
|
//TODO throw exception |
|
} |
|
token = ""; |
|
return token; |
|
} |
|
|
|
unsigned int Parser::_countTokens(unsigned int leftBound, unsigned int rightBount) |
|
{ |
|
unsigned int position = _skipWhiteSpacesFromContent(leftBound); |
|
unsigned int tokensCount = 0; |
|
|
|
while (position < rightBount) |
|
{ |
|
position = _fileContent.find_first_of(WHITESPACES, position); |
|
if (position != string::npos) |
|
++tokensCount; |
|
//start search from next symbol |
|
++position; |
|
} |
|
return tokensCount; |
|
} |
|
|
|
unsigned int Parser::_skipWhiteSpaces(const std::string & str, unsigned int fromPosition) |
|
{ |
|
unsigned int position = fromPosition; |
|
if(WHITESPACES.find(str[0]) != string::npos) |
|
position = str.find_first_not_of(WHITESPACES, position); |
|
return position; |
|
} |
|
|
|
unsigned int Parser::_skipWhiteSpacesFromContent(unsigned int fromPosition) |
|
{ |
|
unsigned int position = fromPosition; |
|
if(WHITESPACES.find(_fileContent[position]) != string::npos) |
|
position = _fileContent.find_first_not_of(WHITESPACES, position);// + 1; |
|
|
|
return position; |
|
} |
|
|
|
const std::string & Parser::_getObjectContent(unsigned int objectPosition, unsigned int & objectNumber, unsigned int & generationNumber, std::pair<unsigned int, unsigned int> & streamBounds, bool & hasObjectStream) |
|
{ |
|
hasObjectStream = false; |
|
unsigned int currentPosition = objectPosition; |
|
|
|
std::string token = _getNextToken(currentPosition); // number of object |
|
objectNumber = Utils::stringToInt(token); |
|
|
|
token = _getNextToken(currentPosition); // generation number - not interesting |
|
generationNumber = Utils::stringToInt(token); |
|
|
|
token = Parser::getNextToken(_fileContent,currentPosition); |
|
|
|
if( token != "obj" ) |
|
{ |
|
std::stringstream strOut; |
|
strOut<<"Wrong object in PDF, in position "<<currentPosition<<" cannot continue!\n"; |
|
throw Exception(strOut.str()); |
|
} |
|
|
|
static std::string objectContent; |
|
|
|
size_t contentStart = _fileContent.find_first_not_of(Parser::WHITESPACES,currentPosition); |
|
if( contentStart == std::string::npos ) |
|
{ |
|
std::stringstream strOut; |
|
strOut<<"Wrong object "<< objectNumber<< "in PDF, cannot find content for it\n"; |
|
throw Exception(strOut.str()); |
|
} |
|
currentPosition = contentStart; |
|
unsigned int endOfContent = _fileContent.find("endobj", contentStart); |
|
if( endOfContent == std::string::npos ) |
|
{ |
|
stringstream errorMessage("Corrupted PDF file, obj does not have matching endobj"); |
|
throw Exception(errorMessage); |
|
} |
|
unsigned int endOfStream = _fileContent.find("endstream", currentPosition); |
|
if((endOfStream != std::string::npos) && (endOfStream < endOfContent)) |
|
{ |
|
std::string stream("stream"); |
|
unsigned int beginOfStream = _fileContent.find(stream, currentPosition) + stream.size(); |
|
while(_fileContent[beginOfStream] == '\r') |
|
{ |
|
++beginOfStream; |
|
} |
|
if( _fileContent[beginOfStream] == '\n') |
|
{ |
|
++beginOfStream; |
|
} |
|
streamBounds.first = beginOfStream; |
|
|
|
// try to use Length field to determine end of stream. |
|
std::string lengthToken = "/Length"; |
|
size_t lengthBegin = Parser::findTokenName(_fileContent,lengthToken,contentStart); |
|
if ( lengthBegin != std::string::npos ) |
|
{ |
|
std::string lengthStr; |
|
size_t lenPos = lengthBegin + lengthToken.size(); |
|
bool useContentLength = false; |
|
if( Parser::getNextWord(lengthStr,_fileContent,lenPos) ) |
|
{ |
|
useContentLength = true; |
|
std::string refStr; |
|
if( Parser::getNextWord(refStr,_fileContent,lenPos)) |
|
{ |
|
if( Parser::getNextWord(refStr,_fileContent,lenPos)) |
|
{ |
|
if( refStr == "R" ) |
|
{ |
|
useContentLength = false; |
|
//it is reference |
|
} |
|
} |
|
} |
|
} |
|
if( useContentLength ) |
|
{ |
|
std::stringstream strin(lengthStr); |
|
unsigned int streamEnd = 0; |
|
strin>>streamEnd; |
|
streamEnd += beginOfStream; |
|
unsigned int streamEndBegin = _fileContent.find("endstream",streamEnd); |
|
if( streamEndBegin != std::string::npos ) |
|
{ |
|
endOfStream = streamEndBegin; |
|
} |
|
} |
|
} |
|
streamBounds.second = endOfStream; |
|
endOfContent = beginOfStream; |
|
hasObjectStream = true; |
|
|
|
} |
|
unsigned int contentSize = endOfContent - currentPosition; |
|
|
|
objectContent.resize(contentSize); |
|
memcpy(&objectContent[0], &_fileContent[currentPosition], contentSize); |
|
return objectContent; |
|
|
|
} |
|
|
|
unsigned int Parser::_readTrailerAndReturnRoot() |
|
{ |
|
|
|
unsigned int startOfTrailer = Parser::findToken(_fileContent,"trailer", _getStartOfXrefWithRoot()); |
|
std::string rootStr("/Root"); |
|
unsigned int startOfRoot = Parser::findToken(_fileContent,rootStr.data(), startOfTrailer); |
|
if( startOfRoot == std::string::npos) |
|
{ |
|
throw Exception("Cannot find Root object !"); |
|
} |
|
std::string encryptStr("/Encrypt"); |
|
if( Parser::findToken(_fileContent,encryptStr,startOfTrailer) != std::string::npos ) |
|
{ |
|
throw Exception("Encrypted PDF is not supported!"); |
|
} |
|
startOfRoot += rootStr.size()+1; //"/Root + ' ' |
|
unsigned int endOfRoot = startOfRoot; |
|
while(NUMBERS.find(_fileContent[endOfRoot++]) != string::npos) |
|
{} |
|
--endOfRoot; |
|
return Utils::stringToInt(_fileContent.substr(startOfRoot, endOfRoot - startOfRoot)); |
|
} |
|
|
|
unsigned int Parser::_readTrailerAndRterievePrev(const unsigned int startPositionForSearch, unsigned int & previosXref) |
|
{ |
|
unsigned int startOfTrailer = Parser::findToken(_fileContent,"trailer", startPositionForSearch); |
|
if( startOfTrailer == std::string::npos ) |
|
{ |
|
throw Exception("Cannot find trailer!"); |
|
} |
|
|
|
unsigned int startOfPrev = _fileContent.find("Prev ", startOfTrailer); |
|
unsigned int startxref = _fileContent.find("startxref", startOfTrailer); |
|
if(startOfPrev == string::npos || (startOfPrev > startxref)) |
|
return false; |
|
//"Prev "s length = 5 |
|
else |
|
startOfPrev += 5; |
|
|
|
unsigned int endOfPrev = startOfPrev; |
|
while(NUMBERS.find(_fileContent[endOfPrev++]) != string::npos) |
|
{} |
|
--endOfPrev; |
|
previosXref = Utils::stringToInt(_fileContent.substr(startOfPrev, endOfPrev - startOfPrev)); |
|
return true; |
|
} |
|
|
|
//Method finds the token from current position from string |
|
// It uses PDF whitespaces and delimeters to recognize |
|
// Returned string without begin/end spaces |
|
std::string Parser::getNextToken(const std::string &str, unsigned int &position) |
|
{ |
|
if( position >= str.size() ) |
|
{ |
|
return ""; |
|
} |
|
//skip first spaces |
|
size_t beg_pos = str.find_first_not_of(Parser::WHITESPACES,position); |
|
if ( beg_pos == std::string::npos ) |
|
{ |
|
// it is empty string! |
|
return ""; |
|
} |
|
size_t end_pos = str.find_first_of(Parser::WHITESPACES_AND_DELIMETERS,beg_pos); |
|
if ( end_pos == std::string::npos ) |
|
{ |
|
end_pos = str.size(); |
|
} |
|
position = end_pos; |
|
|
|
std::string out = str.substr(beg_pos,end_pos - beg_pos); |
|
Parser::trim(out); |
|
return out; |
|
} |
|
/** @brief getNextWord |
|
* |
|
* method finds and returns next word from the string |
|
* For example: " 1 0 R \n" will return "1" , then "0" then "R" |
|
*/ |
|
bool Parser::getNextWord(std::string &out, const std::string &str, size_t &nextPosition, size_t *found) |
|
{ |
|
if( found ) |
|
{ |
|
*found = std::string::npos; |
|
} |
|
//trace("position = %d",position); |
|
if( nextPosition >= str.size() ) |
|
{ |
|
return false; |
|
} |
|
//skip first spaces |
|
size_t beg_pos = str.find_first_not_of(Parser::WHITESPACES,nextPosition); |
|
if ( beg_pos == std::string::npos ) |
|
{ |
|
// it is empty string! |
|
return false; |
|
} |
|
if( found ) |
|
{ |
|
*found = beg_pos; |
|
} |
|
size_t end_pos = str.find_first_of(Parser::WHITESPACES,beg_pos); |
|
|
|
if ( end_pos == std::string::npos ) |
|
{ |
|
end_pos = str.size(); |
|
} |
|
nextPosition = end_pos; |
|
out = str.substr(beg_pos,end_pos - beg_pos); |
|
Parser::trim(out); |
|
if( out.empty() ) |
|
{ |
|
return false; |
|
} |
|
return true; |
|
} |
|
|
|
/** @brief trim |
|
* |
|
* @todo: document this function |
|
*/ |
|
void Parser::trim(std::string &str) |
|
{ |
|
std::string::size_type pos1 = str.find_first_not_of(WHITESPACES); |
|
std::string::size_type pos2 = str.find_last_not_of(WHITESPACES); |
|
str = str.substr(pos1 == std::string::npos ? 0 : pos1, |
|
pos2 == std::string::npos ? str.length() - 1 : pos2 - pos1 + 1); |
|
} |
|
|
|
// Method tries to find the PDF token from the content |
|
// The token is "/L 12 0R" or /Length 123 |
|
std::string Parser::findTokenStr(const std::string &content, const std::string &pattern, size_t start, size_t &foundStart, size_t &foundEnd) |
|
{ |
|
size_t cur_pos = Parser::findToken(content,pattern,start); |
|
if( cur_pos == std::string::npos ) |
|
{ |
|
return ""; |
|
} |
|
foundStart = cur_pos; |
|
cur_pos += pattern.size(); |
|
// then lets parse the content of remaining part |
|
size_t end_pos = content.find_first_of(Parser::DELIMETERS,cur_pos); |
|
if( end_pos == std::string::npos ) |
|
{ |
|
end_pos = content.size(); |
|
} |
|
std::string token = content.substr(cur_pos,end_pos-cur_pos); |
|
foundEnd = end_pos -1; |
|
return token; |
|
} |
|
|
|
// Method tries to find token in the string from specified position, |
|
// returns position of first occurent or npos if not found |
|
// It properly handles cases when content contains strings which |
|
// contains token but not euqal to it |
|
// Example: content "/Transparency/ ..." pattern "/Trans |
|
// will return npos. |
|
size_t Parser::findToken(const std::string &content, const std::string &keyword,size_t start) |
|
{ |
|
size_t cur_pos = start; |
|
// lets find pattern first |
|
size_t foundStart = std::string::npos; |
|
size_t savedPos = 0; |
|
while( 1 ) |
|
{ |
|
cur_pos = content.find(keyword,cur_pos); |
|
if( cur_pos == std::string::npos ) |
|
{ |
|
break; |
|
} |
|
savedPos = cur_pos; |
|
cur_pos += keyword.size(); |
|
if( cur_pos < content.size() ) |
|
{ |
|
if( Parser::WHITESPACES.find(content[cur_pos]) != std::string::npos || |
|
Parser::DELIMETERS.find(content[cur_pos]) != std::string::npos ) |
|
{ |
|
foundStart = savedPos; |
|
break; |
|
} |
|
} |
|
else |
|
{ |
|
foundStart = savedPos; |
|
// end of line is reached |
|
break; |
|
} |
|
} |
|
return foundStart; |
|
} |
|
|
|
// Method checks if token at current position can be a Name or it is not name but value |
|
// Example |
|
// /H /P /P 12 0 R |
|
// the tag /P can be a name (and a value also), while 12 cannot |
|
// start defines the position of token content |
|
bool Parser::tokenIsAName(const std::string &content, size_t start ) |
|
{ |
|
std::string openBraces = "<[({"; |
|
bool found = false; |
|
while(1) |
|
{ |
|
size_t foundNonWhite = content.find_first_not_of(Parser::WHITESPACES,start); |
|
size_t foundDelim = content.find_first_of(Parser::DELIMETERS,start); |
|
|
|
if( foundNonWhite != std::string::npos && |
|
foundDelim != std::string::npos ) |
|
{ |
|
if( (foundNonWhite < foundDelim ) || ( openBraces.find(content[foundDelim]) != std::string::npos) ) |
|
{ |
|
if( found ) |
|
{ |
|
return false; |
|
} |
|
else |
|
{ |
|
return true; |
|
} |
|
} |
|
else |
|
{ |
|
if( found ) |
|
{ |
|
return true; |
|
} |
|
else |
|
{ |
|
found = true; |
|
start = content.find_first_of(Parser::WHITESPACES_AND_DELIMETERS,foundDelim+1); |
|
} |
|
} |
|
} |
|
else |
|
{ |
|
return true; |
|
} |
|
} |
|
} |
|
|
|
// Method tries to find token name in the string from specified position, |
|
// For example, the string contains /H /P /P 12 0 R. |
|
// If search for /P then it will return position of /P 12 0 R, not value of |
|
// /H /P |
|
size_t Parser::findTokenName(const std::string &content, const std::string &keyword,size_t start) |
|
{ |
|
size_t cur_pos = start; |
|
// lets find pattern first |
|
size_t foundStart = std::string::npos; |
|
size_t savedPos = 0; |
|
std::string braces = "<[({"; |
|
while( 1 ) |
|
{ |
|
cur_pos = content.find(keyword,cur_pos); |
|
if( cur_pos == std::string::npos ) |
|
{ |
|
break; |
|
} |
|
savedPos = cur_pos; |
|
cur_pos += keyword.size(); |
|
if( cur_pos < content.size() ) |
|
{ |
|
if( Parser::WHITESPACES_AND_DELIMETERS.find(content[cur_pos]) != std::string::npos ) |
|
{ |
|
if( tokenIsAName(content,cur_pos ) ) |
|
{ |
|
foundStart = savedPos; |
|
break; |
|
} |
|
} |
|
} |
|
else |
|
{ |
|
foundStart = savedPos; |
|
// end of line is reached |
|
break; |
|
} |
|
} |
|
return foundStart; |
|
} |
|
|
|
// Returns the position in `content` at which the element value that starts
// at `startOfPageElement` ends.
//
// The scan keeps a stack of expected closing delimiters. It starts with the
// set "/]>)}" (any of which ends a plain value); every '[' or '<' met first
// pushes the matching "]" or ">" and every expected delimiter met first pops
// one level. When the stack is empty the result is the next occurrence of
// the last expected delimiter at or after the current position, or the
// current position itself when there is none.
// A '/' that directly starts the value is stepped over once, so that the
// value's own leading slash is not taken for its end.
//
// All search results are kept in size_type: with `unsigned int` the npos
// tests never matched on 64-bit targets, which could index `content` out of
// range when nothing was found.
unsigned int Parser::findEndOfElementContent(const std::string &content,unsigned int startOfPageElement)
{
   const std::string endDelim("/]>)}");

   std::stack<std::string> delimStack;
   delimStack.push(endDelim); //initial delimeter
   std::string delimeter = endDelim;

   std::string::size_type foundEnd = std::string::npos;
   std::string::size_type curPos = startOfPageElement;
   bool compensation = true;

   while(true)
   {
      const std::string::size_type nonWhiteSpace = content.find_first_not_of(Parser::WHITESPACES, curPos);
      const std::string::size_type foundDelimeter = content.find_first_of(delimeter, curPos);
      const std::string::size_type foundOpenBrace = content.find('[', curPos);
      const std::string::size_type foundOpenDict = content.find('<', curPos);

      if( foundDelimeter == std::string::npos &&
          foundOpenBrace == std::string::npos &&
          foundOpenDict == std::string::npos )
      {
         // nothing left to match: give up on the innermost level
         if( !delimStack.empty() )
         {
            delimStack.pop();
         }
      }
      else if( foundDelimeter <= foundOpenBrace && foundDelimeter <= foundOpenDict )
      {
         // the expected delimiter comes first (it cannot be npos here)
         if( !delimStack.empty() )
         {
            delimStack.pop();
         }
         if( nonWhiteSpace == foundDelimeter && delimeter == endDelim )
         {
            curPos = foundDelimeter;
            if( content[foundDelimeter] == '/' && compensation )
            {
               curPos++;
               compensation = false;
            }
         }
         else
         {
            compensation = false;
            if( delimeter == endDelim )
            {
               curPos = foundDelimeter;
            }
            else
            {
               curPos = foundDelimeter + delimeter.size();
            }
         }
      }
      else if( foundOpenBrace <= foundDelimeter && foundOpenBrace <= foundOpenDict )
      {
         // an array opens first
         compensation = false;
         delimStack.push("]");
         curPos = foundOpenBrace + 1;
      }
      else if( foundOpenDict <= foundDelimeter && foundOpenDict <= foundOpenBrace )
      {
         // a '<' opens first
         compensation = false;
         delimStack.push(">");
         curPos = foundOpenDict + 1;
      }

      if( delimStack.empty() )
      {
         foundEnd = content.find_first_of(delimeter, curPos);
         if( foundEnd == std::string::npos )
         {
            foundEnd = curPos;
         }
         break;
      }
      delimeter = delimStack.top();
   }
   return static_cast<unsigned int>(foundEnd);
}
|
|
|
|