You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1278 lines
30 KiB
1278 lines
30 KiB
//======================================================================== |
|
// |
|
// TextOutputDev.cc |
|
// |
|
// Copyright 1997-2002 Glyph & Cog, LLC |
|
// |
|
//======================================================================== |
|
|
|
#include <aconf.h> |
|
|
|
#ifdef USE_GCC_PRAGMAS |
|
#pragma implementation |
|
#endif |
|
|
|
#include <stdio.h> |
|
#include <stdlib.h> |
|
#include <stddef.h> |
|
#include <math.h> |
|
#include <ctype.h> |
|
#include "GString.h" |
|
#include "gmem.h" |
|
#include "config.h" |
|
#include "Error.h" |
|
#include "GlobalParams.h" |
|
#include "UnicodeMap.h" |
|
#include "GfxState.h" |
|
#include "TextOutputDev.h" |
|
|
|
#ifdef MACOS |
|
// needed for setting type/creator of MacOS files |
|
#include "ICSupport.h" |
|
#endif |
|
|
|
//------------------------------------------------------------------------ |
|
|
|
// Minimum horizontal gap between two consecutive strings of a block,
// expressed as a fraction of the string height, at which a space
// character is inserted between them.
#define textOutSpace 0.2

// Horizontal gap threshold, expressed as a fraction of the string
// height, used while searching for column edges.
#define textOutColSpace 0.2
|
|
|
//------------------------------------------------------------------------ |
|
|
|
// One column edge detected while coalescing: the x position of the
// edge together with the y range it spans.
struct TextOutColumnEdge {
  double x;   // x coordinate of the edge
  double y0;  // minimum y covered by the edge
  double y1;  // maximum y covered by the edge
};
|
|
|
//------------------------------------------------------------------------ |
|
// TextBlock |
|
//------------------------------------------------------------------------ |
|
|
|
class TextBlock { |
|
public: |
|
|
|
TextBlock(); |
|
~TextBlock(); |
|
|
|
double xMin, xMax; |
|
double yMin, yMax; |
|
TextString *strings; // list of strings in the block |
|
TextBlock *next; // next block in line |
|
TextBlock *xyNext; // next block on xyBlocks list |
|
Unicode *text; // Unicode text of the block, including |
|
// spaces between strings |
|
double *xRight; // right-hand x coord of each char |
|
int len; // total number of Unicode characters |
|
int convertedLen; // total number of converted characters |
|
int *col; // starting column number for each |
|
// Unicode character |
|
}; |
|
|
|
// Creates an empty block.  Every member is given a defined value so
// that a block is never observed with indeterminate geometry or
// lengths, even before TextPage::coalesce() fills it in.
TextBlock::TextBlock()
  : xMin(0), xMax(0), yMin(0), yMax(0),
    strings(NULL),
    next(NULL),
    xyNext(NULL),
    text(NULL),
    xRight(NULL),
    len(0),
    convertedLen(0),
    col(NULL) {
}
|
|
|
// Destroys the block: deletes every string on its list and releases
// the text, xRight and col arrays.
TextBlock::~TextBlock() {
  TextString *cur = strings;

  while (cur) {
    // grab the successor before the node goes away
    TextString *following = cur->next;
    delete cur;
    cur = following;
  }
  gfree(col);
  gfree(xRight);
  gfree(text);
}
|
|
|
//------------------------------------------------------------------------ |
|
// TextLine |
|
//------------------------------------------------------------------------ |
|
|
|
class TextLine { |
|
public: |
|
|
|
TextLine(); |
|
~TextLine(); |
|
|
|
TextBlock *blocks; |
|
TextLine *next; |
|
double yMin, yMax; |
|
}; |
|
|
|
// Creates an empty line.  The vertical extent is zeroed as well as the
// list pointers so that no member is left indeterminate.
TextLine::TextLine()
  : blocks(NULL),
    next(NULL),
    yMin(0), yMax(0) {
}
|
|
|
// Destroys the line, deleting every block on its list.
TextLine::~TextLine() {
  TextBlock *cur = blocks;

  while (cur) {
    // grab the successor before the node goes away
    TextBlock *following = cur->next;
    delete cur;
    cur = following;
  }
}
|
|
|
//------------------------------------------------------------------------ |
|
// TextString |
|
//------------------------------------------------------------------------ |
|
|
|
// Starts an empty string at (x0, y0).  The point is run through the
// state's transform, and the vertical extent is derived from the
// current font's ascent/descent scaled by <fontSize>.
TextString::TextString(GfxState *state, double x0, double y0,
                       double fontSize) {
  double devX, devY;

  state->transform(x0, y0, &devX, &devY);

  GfxFont *curFont = state->getFont();
  if (!curFont) {
    // the PDF file is drawing text without a current font, which
    // should never happen -- fall back to generic metrics
    yMin = devY - 0.95 * fontSize;
    yMax = devY + 0.35 * fontSize;
  } else {
    yMin = devY - curFont->getAscent() * fontSize;
    yMax = devY - curFont->getDescent() * fontSize;
  }

  // sanity check for a case that shouldn't happen: a zero-height
  // string would lead to a division by zero later on
  if (yMin == yMax) {
    yMin = devY;
    yMax = devY + 1;
  }

  next = NULL;
  text = NULL;
  xRight = NULL;
  len = size = 0;
  marked = gFalse;
}
|
|
|
|
|
// Releases the character and right-edge arrays.
TextString::~TextString() {
  gfree(xRight);
  gfree(text);
}
|
|
|
// Appends the character <u>, which starts at <x> and is <dx> wide, to
// the string.  <state>, <y> and <dy> are not used here.
void TextString::addChar(GfxState *state, double x, double y,
                         double dx, double dy, Unicode u) {
  // both parallel arrays grow together, 16 entries at a time
  if (len == size) {
    size += 16;
    text = (Unicode *)grealloc(text, size * sizeof(Unicode));
    xRight = (double *)grealloc(xRight, size * sizeof(double));
  }

  const double rightEdge = x + dx;

  // the first character fixes the left edge of the string
  if (len == 0) {
    xMin = x;
  }
  text[len] = u;
  xRight[len] = rightEdge;
  xMax = rightEdge;
  ++len;
}
|
|
|
//------------------------------------------------------------------------ |
|
// TextPage |
|
//------------------------------------------------------------------------ |
|
|
|
// Creates an empty page.  <rawOrderA> selects raw (content stream)
// ordering of the strings instead of xy ordering.
TextPage::TextPage(GBool rawOrderA) {
  rawOrder = rawOrderA;

  // no string under construction, no nesting
  curStr = NULL;
  nest = 0;
  fontSize = 0;
  nTinyChars = 0;

  // empty string list and insertion cursors
  xyStrings = NULL;
  xyCur1 = xyCur2 = NULL;

  // nothing coalesced yet
  lines = NULL;
}
|
|
|
// Destroys the page, releasing everything via clear().
TextPage::~TextPage() {
  this->clear();
}
|
|
|
// Recomputes the font size used for new strings from the graphics
// state.  For Type 3 fonts the size is additionally rescaled by a
// guess based on glyph widths and by the font matrix.
void TextPage::updateFont(GfxState *state) {
  GfxFont *font;
  Gfx8BitFont *font8;
  double *fm;
  char *name;
  int code, mCode, letterCode, anyCode;
  double w;

  // adjust the font size
  fontSize = state->getTransformedFontSize();
  font = state->getFont();
  if (!font || font->getType() != fontType3) {
    return;
  }

  // This is a hack which makes it possible to deal with some Type 3
  // fonts.  The problem is that it's impossible to know what the
  // base coordinate system used in the font is without actually
  // rendering the font.  This code tries to guess by looking at the
  // width of the character 'm' (which breaks if the font is a
  // subset that doesn't contain 'm').
  font8 = (Gfx8BitFont *)font;
  mCode = letterCode = anyCode = -1;
  for (code = 0; code < 256; ++code) {
    name = font8->getCharName(code);
    if (!name) {
      continue;
    }
    if (name[0] == 'm' && name[1] == '\0') {
      mCode = code;
    }
    // name[0] is tested before name[1] so that an empty name is never
    // read past its terminator
    if (letterCode < 0 &&
        ((name[0] >= 'A' && name[0] <= 'Z') ||
         (name[0] >= 'a' && name[0] <= 'z')) &&
        name[1] == '\0') {
      letterCode = code;
    }
    if (anyCode < 0 && font8->getWidth(code) > 0) {
      anyCode = code;
    }
  }

  if (mCode >= 0 &&
      (w = font8->getWidth(mCode)) > 0) {
    // 0.6 is a generic average 'm' width -- yes, this is a hack
    fontSize *= w / 0.6;
  } else if (letterCode >= 0 &&
             (w = font8->getWidth(letterCode)) > 0) {
    // even more of a hack: 0.5 is a generic letter width
    fontSize *= w / 0.5;
  } else if (anyCode >= 0 &&
             (w = font8->getWidth(anyCode)) > 0) {
    // better than nothing: 0.5 is a generic character width
    fontSize *= w / 0.5;
  }

  // account for a non-square font matrix
  fm = font->getFontMatrix();
  if (fm[0] != 0) {
    fontSize *= fabs(fm[3] / fm[0]);
  }
}
|
|
|
// Begins a new string at (x0, y0).  If a string is already being
// built, only the nesting depth is bumped: Type 3 characters can
// contain text-drawing operations of their own.
void TextPage::beginString(GfxState *state, double x0, double y0) {
  if (!curStr) {
    curStr = new TextString(state, x0, y0, fontSize);
  } else {
    ++nest;
  }
}
|
|
|
// Adds the <uLen> Unicode characters in <u>, drawn at (x, y) with
// advance (dx, dy), to the current string.  Does nothing when no
// string is being built.
void TextPage::addChar(GfxState *state, double x, double y,
                       double dx, double dy, Unicode *u, int uLen) {
  if (!curStr) {
    return;
  }

  double devX, devY;      // transformed character origin
  double devW, devH;      // transformed advance, without char spacing
  double spaceDx, spaceDy;

  // ignore characters whose origin falls outside the page
  state->transform(x, y, &devX, &devY);
  if (devX < 0 || devX > state->getPageWidth() ||
      devY < 0 || devY > state->getPageHeight()) {
    return;
  }

  // take the character spacing back out of the advance
  state->textTransformDelta(state->getCharSpace() * state->getHorizScaling(),
                            0, &spaceDx, &spaceDy);
  dx -= spaceDx;
  dy -= spaceDy;
  state->transformDelta(dx, dy, &devW, &devH);

  // tiny characters are counted, and dropped once more than 20000 of
  // them have been seen (unless the user asked to keep them)
  if (!globalParams->getTextKeepTinyChars() &&
      fabs(devW) < 3 && fabs(devH) < 3 &&
      ++nTinyChars > 20000) {
    return;
  }

  // large char spacing is sometimes used to move text around
  const int curLen = curStr->len;
  if (curLen > 0 &&
      devX - curStr->xRight[curLen - 1] >
        0.1 * (curStr->yMax - curStr->yMin)) {
    endString();
    beginString(state, x, y);
  }

  // large word spacing is sometimes used to move text around
  if (uLen == 1 && u[0] == (Unicode)0x20 &&
      devW > 0.5 * (curStr->yMax - curStr->yMin)) {
    return;
  }

  // split the advance evenly over the Unicode characters
  if (uLen != 0) {
    devW /= uLen;
    devH /= uLen;
  }
  for (int k = 0; k < uLen; ++k) {
    curStr->addChar(state, devX + k*devW, devY + k*devH, devW, devH, u[k]);
  }
}
|
|
|
void TextPage::endString() { |
|
// This check is needed because Type 3 characters can contain |
|
// text-drawing operations. |
|
if (nest > 0) { |
|
--nest; |
|
return; |
|
} |
|
|
|
addString(curStr); |
|
curStr = NULL; |
|
} |
|
|
|
// Inserts <str> into the page's string list.  Null pointers are
// ignored and empty strings are deleted.  In raw order the string goes
// right after the previously inserted one; otherwise it is placed
// according to xyBefore(), using xyCur1/xyCur2 as a cursor.
void TextPage::addString(TextString *str) {
  if (!str) {
    return;
  }

  TextString *prev, *succ;

  // throw away zero-length strings -- they don't have valid xMin/xMax
  // values, and they're useless anyway
  if (str->len == 0) {
    delete str;
    return;
  }

  // find the insertion point: <str> goes between <prev> and <succ>
  if (rawOrder) {
    prev = xyCur1;
    succ = NULL;
  } else if ((!xyCur1 || xyBefore(xyCur1, str)) &&
             (!xyCur2 || xyBefore(str, xyCur2))) {
    // fits exactly at the cursor
    prev = xyCur1;
    succ = xyCur2;
  } else if (xyCur1 && xyBefore(xyCur1, str)) {
    // belongs somewhere after the cursor: scan forward from it
    prev = xyCur1;
    succ = xyCur2;
    while (succ && !xyBefore(str, succ)) {
      prev = succ;
      succ = succ->next;
    }
    xyCur2 = succ;
  } else {
    // belongs before the cursor: scan from the head of the list
    prev = NULL;
    succ = xyStrings;
    while (succ && !xyBefore(str, succ)) {
      prev = succ;
      succ = succ->next;
    }
    xyCur2 = succ;
  }

  // splice the string in and move the cursor onto it
  xyCur1 = str;
  str->next = succ;
  if (prev) {
    prev->next = str;
  } else {
    xyStrings = str;
  }
}
|
|
|
void TextPage::coalesce() { |
|
TextLine *line, *line0; |
|
TextBlock *yxBlocks, *xyBlocks, *blk, *blk0, *blk1, *blk2; |
|
TextString *str0, *str1, *str2, *str3, *str4; |
|
TextString *str1prev, *str2prev, *str3prev; |
|
TextOutColumnEdge *edges; |
|
UnicodeMap *uMap; |
|
GBool isUnicode; |
|
char buf[8]; |
|
int edgesLength, edgesSize; |
|
double x, yMin, yMax; |
|
double space, fit1, fit2, h; |
|
int col1, col2, d; |
|
int i, j; |
|
|
|
#if 0 //~ for debugging |
|
for (str1 = xyStrings; str1; str1 = str1->next) { |
|
printf("x=%.2f..%.2f y=%.2f..%.2f size=%.2f '", |
|
str1->xMin, str1->xMax, str1->yMin, str1->yMax, |
|
(str1->yMax - str1->yMin)); |
|
for (i = 0; i < str1->len; ++i) { |
|
fputc(str1->text[i] & 0xff, stdout); |
|
} |
|
printf("'\n"); |
|
} |
|
printf("\n------------------------------------------------------------\n\n"); |
|
#endif |
|
|
|
// build the list of column edges |
|
edges = NULL; |
|
edgesLength = edgesSize = 0; |
|
if (!rawOrder) { |
|
for (str1prev = NULL, str1 = xyStrings; |
|
str1; |
|
str1prev = str1, str1 = str1->next) { |
|
if (str1->marked) { |
|
continue; |
|
} |
|
h = str1->yMax - str1->yMin; |
|
if (str1prev && (str1->xMin - str1prev->xMax) / h < textOutColSpace) { |
|
continue; |
|
} |
|
x = str1->xMin; |
|
yMin = str1->yMin; |
|
yMax = str1->yMax; |
|
for (str2prev = str1, str2 = str1->next; |
|
str2; |
|
str2prev = str2, str2 = str2->next) { |
|
h = str2->yMax - str2->yMin; |
|
if (!str2->marked && |
|
(str2->xMin - str2prev->xMax) / h > textOutColSpace && |
|
fabs(str2->xMin - x) < 0.5 && |
|
str2->yMin - yMax < 0.3 * h && |
|
yMin - str2->yMax < 0.3 * h) { |
|
break; |
|
} |
|
} |
|
if (str2) { |
|
if (str2->yMin < yMin) { |
|
yMin = str2->yMin; |
|
} |
|
if (str2->yMax > yMax) { |
|
yMax = str2->yMax; |
|
} |
|
str2->marked = gTrue; |
|
for (str3prev = str1, str3 = str1->next; |
|
str3; |
|
str3prev = str3, str3 = str3->next) { |
|
h = str3->yMax - str3->yMin; |
|
if (!str3->marked && |
|
(str3->xMin - str3prev->xMax) / h > textOutColSpace && |
|
fabs(str3->xMin - x) < 0.5 && |
|
str3->yMin - yMax < 0.3 * h && |
|
yMin - str3->yMax < 0.3 * h) { |
|
break; |
|
} |
|
} |
|
if (str3) { |
|
if (str3->yMin < yMin) { |
|
yMin = str3->yMin; |
|
} |
|
if (str3->yMax > yMax) { |
|
yMax = str3->yMax; |
|
} |
|
str3->marked = gTrue; |
|
do { |
|
for (str2prev = str1, str2 = str1->next; |
|
str2; |
|
str2prev = str2, str2 = str2->next) { |
|
h = str2->yMax - str2->yMin; |
|
if (!str2->marked && |
|
(str2->xMin - str2prev->xMax) / h > textOutColSpace && |
|
fabs(str2->xMin - x) < 0.5 && |
|
str2->yMin - yMax < 0.3 * h && |
|
yMin - str2->yMax < 0.3 * h) { |
|
if (str2->yMin < yMin) { |
|
yMin = str2->yMin; |
|
} |
|
if (str2->yMax > yMax) { |
|
yMax = str2->yMax; |
|
} |
|
str2->marked = gTrue; |
|
break; |
|
} |
|
} |
|
} while (str2); |
|
if (edgesLength == edgesSize) { |
|
edgesSize = edgesSize ? 2 * edgesSize : 16; |
|
edges = (TextOutColumnEdge *) |
|
grealloc(edges, edgesSize * sizeof(TextOutColumnEdge)); |
|
} |
|
edges[edgesLength].x = x; |
|
edges[edgesLength].y0 = yMin; |
|
edges[edgesLength].y1 = yMax; |
|
++edgesLength; |
|
} else { |
|
str2->marked = gFalse; |
|
} |
|
} |
|
str1->marked = gTrue; |
|
} |
|
} |
|
|
|
#if 0 //~ for debugging |
|
printf("column edges:\n"); |
|
for (i = 0; i < edgesLength; ++i) { |
|
printf("%d: x=%.2f y0=%.2f y1=%.2f\n", |
|
i, edges[i].x, edges[i].y0, edges[i].y1); |
|
} |
|
printf("\n------------------------------------------------------------\n\n"); |
|
#endif |
|
|
|
// build the blocks |
|
yxBlocks = NULL; |
|
blk1 = blk2 = NULL; |
|
while (xyStrings) { |
|
|
|
// build the block |
|
str0 = xyStrings; |
|
xyStrings = xyStrings->next; |
|
str0->next = NULL; |
|
blk = new TextBlock(); |
|
blk->strings = str0; |
|
blk->xMin = str0->xMin; |
|
blk->xMax = str0->xMax; |
|
blk->yMin = str0->yMin; |
|
blk->yMax = str0->yMax; |
|
while (xyStrings) { |
|
str1 = NULL; |
|
str2 = xyStrings; |
|
fit1 = coalesceFit(str0, str2); |
|
if (!rawOrder) { |
|
// look for best-fitting string |
|
space = str0->yMax - str0->yMin; |
|
for (str3 = xyStrings, str4 = xyStrings->next; |
|
str4 && str4->xMin - str0->xMax <= space; |
|
str3 = str4, str4 = str4->next) { |
|
fit2 = coalesceFit(str0, str4); |
|
if (fit2 < fit1) { |
|
str1 = str3; |
|
str2 = str4; |
|
fit1 = fit2; |
|
} |
|
} |
|
} |
|
if (fit1 > 1) { |
|
// no fit - we're done with this block |
|
break; |
|
} |
|
|
|
// if we've hit a column edge we're done with this block |
|
if (fit1 > 0.2) { |
|
for (i = 0; i < edgesLength; ++i) { |
|
if (str0->xMax < edges[i].x + 0.5 && edges[i].x - 0.5 < str2->xMin && |
|
str0->yMin < edges[i].y1 && str0->yMax > edges[i].y0 && |
|
str2->yMin < edges[i].y1 && str2->yMax > edges[i].y0) { |
|
break; |
|
} |
|
} |
|
if (i < edgesLength) { |
|
break; |
|
} |
|
} |
|
|
|
if (str1) { |
|
str1->next = str2->next; |
|
} else { |
|
xyStrings = str2->next; |
|
} |
|
str0->next = str2; |
|
str2->next = NULL; |
|
if (str2->xMax > blk->xMax) { |
|
blk->xMax = str2->xMax; |
|
} |
|
if (str2->yMin < blk->yMin) { |
|
blk->yMin = str2->yMin; |
|
} |
|
if (str2->yMax > blk->yMax) { |
|
blk->yMax = str2->yMax; |
|
} |
|
str0 = str2; |
|
} |
|
|
|
// insert block on list |
|
if (!rawOrder) { |
|
// insert block on list in yx order |
|
for (blk1 = NULL, blk2 = yxBlocks; |
|
blk2 && !yxBefore(blk, blk2); |
|
blk1 = blk2, blk2 = blk2->next) ; |
|
} |
|
blk->next = blk2; |
|
if (blk1) { |
|
blk1->next = blk; |
|
} else { |
|
yxBlocks = blk; |
|
} |
|
blk1 = blk; |
|
} |
|
|
|
gfree(edges); |
|
|
|
// the strings are now owned by the lines/blocks tree |
|
xyStrings = NULL; |
|
|
|
// build the block text |
|
uMap = globalParams->getTextEncoding(); |
|
isUnicode = uMap ? uMap->isUnicode() : gFalse; |
|
for (blk = yxBlocks; blk; blk = blk->next) { |
|
blk->len = 0; |
|
for (str1 = blk->strings; str1; str1 = str1->next) { |
|
blk->len += str1->len; |
|
if (str1->next && str1->next->xMin - str1->xMax > |
|
textOutSpace * (str1->yMax - str1->yMin)) { |
|
str1->spaceAfter = gTrue; |
|
++blk->len; |
|
} else { |
|
str1->spaceAfter = gFalse; |
|
} |
|
} |
|
blk->text = (Unicode *)gmalloc(blk->len * sizeof(Unicode)); |
|
blk->xRight = (double *)gmalloc(blk->len * sizeof(double)); |
|
blk->col = (int *)gmalloc(blk->len * sizeof(int)); |
|
i = 0; |
|
for (str1 = blk->strings; str1; str1 = str1->next) { |
|
for (j = 0; j < str1->len; ++j) { |
|
blk->text[i] = str1->text[j]; |
|
blk->xRight[i] = str1->xRight[j]; |
|
++i; |
|
} |
|
if (str1->spaceAfter) { |
|
blk->text[i] = (Unicode)0x0020; |
|
blk->xRight[i] = str1->next->xMin; |
|
++i; |
|
} |
|
} |
|
blk->convertedLen = 0; |
|
for (j = 0; j < blk->len; ++j) { |
|
blk->col[j] = blk->convertedLen; |
|
if (isUnicode) { |
|
++blk->convertedLen; |
|
} else if (uMap) { |
|
blk->convertedLen += uMap->mapUnicode(blk->text[j], buf, sizeof(buf)); |
|
} |
|
} |
|
} |
|
if (uMap) { |
|
uMap->decRefCnt(); |
|
} |
|
|
|
#if 0 //~ for debugging |
|
for (blk = yxBlocks; blk; blk = blk->next) { |
|
printf("[block: x=%.2f..%.2f y=%.2f..%.2f len=%d]\n", |
|
blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->len); |
|
TextString *str; |
|
for (str = blk->strings; str; str = str->next) { |
|
printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f'", |
|
str->xMin, str->xMax, str->yMin, str->yMax, |
|
(str->yMax - str->yMin)); |
|
for (i = 0; i < str->len; ++i) { |
|
fputc(str->text[i] & 0xff, stdout); |
|
} |
|
if (str->spaceAfter) { |
|
fputc(' ', stdout); |
|
} |
|
printf("'\n"); |
|
} |
|
} |
|
printf("\n------------------------------------------------------------\n\n"); |
|
#endif |
|
|
|
// build the lines |
|
lines = NULL; |
|
line0 = NULL; |
|
while (yxBlocks) { |
|
blk0 = yxBlocks; |
|
yxBlocks = yxBlocks->next; |
|
blk0->next = NULL; |
|
line = new TextLine(); |
|
line->blocks = blk0; |
|
line->yMin = blk0->yMin; |
|
line->yMax = blk0->yMax; |
|
while (yxBlocks) { |
|
|
|
// remove duplicated text (fake boldface, shadowed text) |
|
h = blk0->yMax - blk0->yMin; |
|
if (yxBlocks->len == blk0->len && |
|
!memcmp(yxBlocks->text, blk0->text, |
|
yxBlocks->len * sizeof(Unicode)) && |
|
fabs(yxBlocks->yMin - blk0->yMin) / h < 0.2 && |
|
fabs(yxBlocks->yMax - blk0->yMax) / h < 0.2 && |
|
fabs(yxBlocks->xMin - blk0->xMin) / h < 0.2 && |
|
fabs(yxBlocks->xMax - blk0->xMax) / h < 0.2) { |
|
blk1 = yxBlocks; |
|
yxBlocks = yxBlocks->next; |
|
delete blk1; |
|
continue; |
|
} |
|
|
|
if (rawOrder && yxBlocks->yMax < blk0->yMin) { |
|
break; |
|
} |
|
if (yxBlocks->yMin > 0.2*blk0->yMin + 0.8*blk0->yMax || |
|
yxBlocks->xMin < blk0->xMax) { |
|
break; |
|
} |
|
blk1 = yxBlocks; |
|
yxBlocks = yxBlocks->next; |
|
blk0->next = blk1; |
|
blk1->next = NULL; |
|
if (blk1->yMin < line->yMin) { |
|
line->yMin = blk1->yMin; |
|
} |
|
if (blk1->yMax > line->yMax) { |
|
line->yMax = blk1->yMax; |
|
} |
|
blk0 = blk1; |
|
} |
|
if (line0) { |
|
line0->next = line; |
|
} else { |
|
lines = line; |
|
} |
|
line->next = NULL; |
|
line0 = line; |
|
} |
|
|
|
|
|
// sort the blocks into xy order |
|
xyBlocks = NULL; |
|
for (line = lines; line; line = line->next) { |
|
for (blk = line->blocks; blk; blk = blk->next) { |
|
for (blk1 = NULL, blk2 = xyBlocks; |
|
blk2 && !xyBefore(blk, blk2); |
|
blk1 = blk2, blk2 = blk2->xyNext) ; |
|
blk->xyNext = blk2; |
|
if (blk1) { |
|
blk1->xyNext = blk; |
|
} else { |
|
xyBlocks = blk; |
|
} |
|
} |
|
} |
|
|
|
#if 0 //~ for debugging |
|
for (blk = xyBlocks; blk; blk = blk->xyNext) { |
|
printf("[block: x=%.2f..%.2f y=%.2f..%.2f len=%d]\n", |
|
blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->len); |
|
TextString *str; |
|
for (str = blk->strings; str; str = str->next) { |
|
printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f '", |
|
str->xMin, str->xMax, str->yMin, str->yMax, |
|
(str->yMax - str->yMin)); |
|
for (i = 0; i < str->len; ++i) { |
|
fputc(str->text[i] & 0xff, stdout); |
|
} |
|
printf("'\n"); |
|
} |
|
} |
|
printf("\n------------------------------------------------------------\n\n"); |
|
#endif |
|
|
|
// do column assignment |
|
for (blk1 = xyBlocks; blk1; blk1 = blk1->xyNext) { |
|
col1 = 0; |
|
for (blk2 = xyBlocks; blk2 != blk1; blk2 = blk2->xyNext) { |
|
if (blk1->xMin >= blk2->xMax) { |
|
d = (int)((blk1->xMin - blk2->xMax) / |
|
(0.4 * (blk1->yMax - blk1->yMin))); |
|
if (d > 4) { |
|
d = 4; |
|
} |
|
col2 = blk2->col[0] + blk2->convertedLen + d; |
|
if (col2 > col1) { |
|
col1 = col2; |
|
} |
|
} else if (blk1->xMin > blk2->xMin) { |
|
for (i = 0; i < blk2->len && blk1->xMin >= blk2->xRight[i]; ++i) ; |
|
col2 = blk2->col[i]; |
|
if (col2 > col1) { |
|
col1 = col2; |
|
} |
|
} |
|
} |
|
for (j = 0; j < blk1->len; ++j) { |
|
blk1->col[j] += col1; |
|
} |
|
} |
|
|
|
#if 0 //~ for debugging |
|
for (line = lines; line; line = line->next) { |
|
printf("[line]\n"); |
|
for (blk = line->blocks; blk; blk = blk->next) { |
|
printf("[block: col=%d, len=%d]\n", blk->col[0], blk->len); |
|
TextString *str; |
|
for (str = blk->strings; str; str = str->next) { |
|
printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f '", |
|
str->xMin, str->xMax, str->yMin, str->yMax, |
|
(str->yMax - str->yMin)); |
|
for (i = 0; i < str->len; ++i) { |
|
fputc(str->text[i] & 0xff, stdout); |
|
} |
|
if (str->spaceAfter) { |
|
printf(" [space]\n"); |
|
} |
|
printf("'\n"); |
|
} |
|
} |
|
} |
|
printf("\n------------------------------------------------------------\n\n"); |
|
#endif |
|
} |
|
|
|
|
|
// Searches the page for the <len> Unicode characters in <s>, ignoring
// the case of Latin A-Z.
//
// If <top> is false the search starts after the position given by
// (*xMin, *yMin); if <bottom> is false it stops at (*xMax, *yMax).
// On success the bounding box of the match is stored in *xMin, *yMin,
// *xMax, *yMax and gTrue is returned; otherwise gFalse is returned
// and the four values are left unchanged.
//
// An empty or null search string never matches.
GBool TextPage::findText(Unicode *s, int len,
                         GBool top, GBool bottom,
                         double *xMin, double *yMin,
                         double *xMax, double *yMax) {
  TextLine *line;
  TextBlock *blk;
  Unicode *p;
  Unicode u1, u2;
  int m, i, j;
  double x0, x1, x;

  // with len <= 0 the comparison loop below would "match" at once and
  // then read xRight[i + len - 1] out of bounds
  if (!s || len <= 0) {
    return gFalse;
  }

  // scan all blocks on page
  for (line = lines; line; line = line->next) {
    for (blk = line->blocks; blk; blk = blk->next) {

      // check: above top limit?
      if (!top && (blk->yMax < *yMin ||
                   (blk->yMin < *yMin && blk->xMax <= *xMin))) {
        continue;
      }

      // check: below bottom limit?
      if (!bottom && (blk->yMin > *yMax ||
                      (blk->yMax > *yMax && blk->xMin >= *xMax))) {
        return gFalse;
      }

      // search each position in this block
      m = blk->len;
      for (i = 0, p = blk->text; i <= m - len; ++i, ++p) {

        // horizontal midpoint of character i
        x0 = (i == 0) ? blk->xMin : blk->xRight[i-1];
        x1 = blk->xRight[i];
        x = 0.5 * (x0 + x1);

        // check: above top limit?
        if (!top && blk->yMin < *yMin) {
          if (x < *xMin) {
            continue;
          }
        }

        // check: below bottom limit?
        if (!bottom && blk->yMax > *yMax) {
          if (x > *xMax) {
            return gFalse;
          }
        }

        // compare the strings
        //~ this lowercases Latin A-Z only -- this will eventually be
        //~ extended to handle other character sets
        for (j = 0; j < len; ++j) {
          if (p[j] >= 0x41 && p[j] <= 0x5a) {
            u1 = p[j] + 0x20;
          } else {
            u1 = p[j];
          }
          if (s[j] >= 0x41 && s[j] <= 0x5a) {
            u2 = s[j] + 0x20;
          } else {
            u2 = s[j];
          }
          if (u1 != u2) {
            break;
          }
        }

        // found it
        if (j == len) {
          *xMin = x0;
          *xMax = blk->xRight[i + len - 1];
          *yMin = blk->yMin;
          *yMax = blk->yMax;
          return gTrue;
        }
      }
    }
  }

  return gFalse;
}
|
|
|
GString *TextPage::getText(double xMin, double yMin, |
|
double xMax, double yMax) { |
|
GString *s; |
|
UnicodeMap *uMap; |
|
GBool isUnicode; |
|
char space[8], eol[16], buf[8]; |
|
int spaceLen, eolLen, len; |
|
TextLine *line; |
|
TextBlock *blk; |
|
double x0, x1, y; |
|
int firstCol, col, i; |
|
GBool multiLine; |
|
|
|
s = new GString(); |
|
|
|
// get the output encoding |
|
if (!(uMap = globalParams->getTextEncoding())) { |
|
return s; |
|
} |
|
isUnicode = uMap->isUnicode(); |
|
spaceLen = uMap->mapUnicode(0x20, space, sizeof(space)); |
|
eolLen = 0; // make gcc happy |
|
switch (globalParams->getTextEOL()) { |
|
case eolUnix: |
|
eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol)); |
|
break; |
|
case eolDOS: |
|
eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); |
|
eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen); |
|
break; |
|
case eolMac: |
|
eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); |
|
break; |
|
} |
|
|
|
// find the leftmost column |
|
multiLine = gFalse; |
|
firstCol = -1; |
|
for (line = lines; line; line = line->next) { |
|
if (line->yMin > yMax) { |
|
break; |
|
} |
|
if (line->yMax < yMin) { |
|
continue; |
|
} |
|
|
|
for (blk = line->blocks; blk && blk->xMax < xMin; blk = blk->next) ; |
|
if (!blk || blk->xMin > xMax) { |
|
continue; |
|
} |
|
|
|
y = 0.5 * (blk->yMin + blk->yMax); |
|
if (y < yMin || y > yMax) { |
|
continue; |
|
} |
|
|
|
if (firstCol >= 0) { |
|
multiLine = gTrue; |
|
} |
|
|
|
i = 0; |
|
while (1) { |
|
x0 = (i==0) ? blk->xMin : blk->xRight[i-1]; |
|
x1 = blk->xRight[i]; |
|
if (0.5 * (x0 + x1) > xMin) { |
|
break; |
|
} |
|
++i; |
|
} |
|
col = blk->col[i]; |
|
|
|
if (firstCol < 0 || col < firstCol) { |
|
firstCol = col; |
|
} |
|
} |
|
|
|
// extract the text |
|
for (line = lines; line; line = line->next) { |
|
if (line->yMin > yMax) { |
|
break; |
|
} |
|
if (line->yMax < yMin) { |
|
continue; |
|
} |
|
|
|
for (blk = line->blocks; blk && blk->xMax < xMin; blk = blk->next) ; |
|
if (!blk || blk->xMin > xMax) { |
|
continue; |
|
} |
|
|
|
y = 0.5 * (blk->yMin + blk->yMax); |
|
if (y < yMin || y > yMax) { |
|
continue; |
|
} |
|
|
|
i = 0; |
|
while (1) { |
|
x0 = (i==0) ? blk->xMin : blk->xRight[i-1]; |
|
x1 = blk->xRight[i]; |
|
if (0.5 * (x0 + x1) > xMin) { |
|
break; |
|
} |
|
++i; |
|
} |
|
|
|
col = firstCol; |
|
|
|
do { |
|
|
|
// line this block up with the correct column |
|
for (; col < blk->col[i]; ++col) { |
|
s->append(space, spaceLen); |
|
} |
|
|
|
// print the block |
|
for (; i < blk->len; ++i) { |
|
|
|
x0 = (i==0) ? blk->xMin : blk->xRight[i-1]; |
|
x1 = blk->xRight[i]; |
|
if (0.5 * (x0 + x1) > xMax) { |
|
break; |
|
} |
|
|
|
len = uMap->mapUnicode(blk->text[i], buf, sizeof(buf)); |
|
s->append(buf, len); |
|
col += isUnicode ? 1 : len; |
|
} |
|
if (i < blk->len) { |
|
break; |
|
} |
|
|
|
// next block |
|
blk = blk->next; |
|
i = 0; |
|
|
|
} while (blk && blk->xMin < xMax); |
|
|
|
if (multiLine) { |
|
s->append(eol, eolLen); |
|
} |
|
} |
|
|
|
uMap->decRefCnt(); |
|
|
|
return s; |
|
} |
|
|
|
void TextPage::dump(void *outputStream, TextOutputFunc outputFunc) { |
|
UnicodeMap *uMap; |
|
char space[8], eol[16], eop[8], buf[8]; |
|
int spaceLen, eolLen, eopLen, len; |
|
TextLine *line; |
|
TextBlock *blk; |
|
int col, d, i; |
|
|
|
// get the output encoding |
|
if (!(uMap = globalParams->getTextEncoding())) { |
|
return; |
|
} |
|
spaceLen = uMap->mapUnicode(0x20, space, sizeof(space)); |
|
eolLen = 0; // make gcc happy |
|
switch (globalParams->getTextEOL()) { |
|
case eolUnix: |
|
eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol)); |
|
break; |
|
case eolDOS: |
|
eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); |
|
eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen); |
|
break; |
|
case eolMac: |
|
eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); |
|
break; |
|
} |
|
eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop)); |
|
|
|
// output |
|
for (line = lines; line; line = line->next) { |
|
col = 0; |
|
for (blk = line->blocks; blk; blk = blk->next) { |
|
|
|
// line this block up with the correct column |
|
if (rawOrder && col == 0) { |
|
col = blk->col[0]; |
|
} else { |
|
for (; col < blk->col[0]; ++col) { |
|
(*outputFunc)(outputStream, space, spaceLen); |
|
} |
|
} |
|
|
|
// print the block |
|
for (i = 0; i < blk->len; ++i) { |
|
len = uMap->mapUnicode(blk->text[i], buf, sizeof(buf)); |
|
(*outputFunc)(outputStream, buf, len); |
|
} |
|
col += blk->convertedLen; |
|
} |
|
|
|
// print a return |
|
(*outputFunc)(outputStream, eol, eolLen); |
|
|
|
// print extra vertical space if necessary |
|
if (line->next) { |
|
d = (int)((line->next->yMin - line->yMax) / |
|
(line->blocks->strings->yMax - lines->blocks->strings->yMin) |
|
+ 0.5); |
|
// various things (weird font matrices) can result in bogus |
|
// values here, so do a sanity check |
|
if (rawOrder && d > 2) { |
|
d = 2; |
|
} else if (!rawOrder && d > 5) { |
|
d = 5; |
|
} |
|
for (; d > 0; --d) { |
|
(*outputFunc)(outputStream, eol, eolLen); |
|
} |
|
} |
|
} |
|
|
|
// end of page |
|
(*outputFunc)(outputStream, eol, eolLen); |
|
(*outputFunc)(outputStream, eop, eopLen); |
|
(*outputFunc)(outputStream, eol, eolLen); |
|
|
|
uMap->decRefCnt(); |
|
} |
|
|
|
// Returns true if <str1> should be inserted before <str2> in xy |
|
// order. |
|
GBool TextPage::xyBefore(TextString *str1, TextString *str2) { |
|
return str1->xMin < str2->xMin || |
|
(str1->xMin == str2->xMin && str1->yMin < str2->yMin); |
|
} |
|
|
|
// Returns true if <blk1> should be inserted before <blk2> in xy |
|
// order. |
|
GBool TextPage::xyBefore(TextBlock *blk1, TextBlock *blk2) { |
|
return blk1->xMin < blk2->xMin || |
|
(blk1->xMin == blk2->xMin && blk1->yMin < blk2->yMin); |
|
} |
|
|
|
// Returns true if <blk1> should be inserted before <blk2> in yx |
|
// order, allowing a little slack for vertically overlapping text. |
|
GBool TextPage::yxBefore(TextBlock *blk1, TextBlock *blk2) { |
|
double h1, h2, overlap; |
|
|
|
h1 = blk1->yMax - blk1->yMin; |
|
h2 = blk2->yMax - blk2->yMin; |
|
overlap = ((blk1->yMax < blk2->yMax ? blk1->yMax : blk2->yMax) - |
|
(blk1->yMin > blk2->yMin ? blk1->yMin : blk2->yMin)) / |
|
(h1 < h2 ? h1 : h2); |
|
if (overlap > 0.6) { |
|
return blk1->xMin < blk2->xMin; |
|
} |
|
return blk1->yMin < blk2->yMin; |
|
} |
|
|
|
// Scores how well <str2> fits as the continuation of <str1>.  Returns
// 10 when the strings should not be joined; otherwise returns the
// horizontal gap between them divided by the larger of the two
// heights.
double TextPage::coalesceFit(TextString *str1, TextString *str2) {
  const double noFit = 10;
  const double h1 = str1->yMax - str1->yMin;
  const double h2 = str2->yMax - str2->yMin;
  const double w1 = str1->xMax - str1->xMin;
  const double w2 = str2->xMax - str2->xMin;

  // heights must be within a factor of three of each other
  const double ratio = h1 / h2;
  if (ratio < (1.0 / 3.0) || ratio > 3) {
    return noFit;
  }

  // the vertical ranges must share at least half the shorter height
  const double sharedMax = str1->yMax < str2->yMax ? str1->yMax : str2->yMax;
  const double sharedMin = str1->yMin > str2->yMin ? str1->yMin : str2->yMin;
  const double overlap = (sharedMax - sharedMin) / (h1 < h2 ? h1 : h2);
  if (overlap < 0.5) {
    return noFit;
  }

  // str2 may not start too far to the left of the end of str1
  const double gap = str2->xMin - str1->xMax;
  const double spacing = gap / (h1 > h2 ? h1 : h2);
  if (spacing < -0.5) {
    return noFit;
  }

  // separate text that overlaps - duplicated text (so that fake
  // boldface and shadowed text can be cleanly removed)
  if (gap / (w1 < w2 ? w1 : w2) < -0.7) {
    return noFit;
  }

  return spacing;
}
|
|
|
void TextPage::clear() { |
|
TextLine *p1, *p2; |
|
TextString *s1, *s2; |
|
|
|
if (curStr) { |
|
delete curStr; |
|
curStr = NULL; |
|
} |
|
if (lines) { |
|
for (p1 = lines; p1; p1 = p2) { |
|
p2 = p1->next; |
|
delete p1; |
|
} |
|
} else if (xyStrings) { |
|
for (s1 = xyStrings; s1; s1 = s2) { |
|
s2 = s1->next; |
|
delete s1; |
|
} |
|
} |
|
xyStrings = NULL; |
|
xyCur1 = xyCur2 = NULL; |
|
lines = NULL; |
|
nest = 0; |
|
nTinyChars = 0; |
|
} |
|
|
|
//------------------------------------------------------------------------ |
|
// TextOutputDev |
|
//------------------------------------------------------------------------ |
|
|
|
// TextOutputFunc-style callback that writes <len> bytes of <text> to
// <stream>, which must be a FILE *.
static void outputToFile(void *stream, char *text, int len) {
  FILE *out = (FILE *)stream;

  fwrite(text, sizeof(char), len, out);
}
|
|
|
// Creates a text output device writing to <fileName>:
//   - NULL: no output stream; the text is only collected,
//   - "-":  standard output,
//   - else: the named file, opened in binary mode and appended to if
//           <append> is true.
// If the file cannot be opened an error is reported, <ok> is set to
// gFalse and no TextPage is created.
TextOutputDev::TextOutputDev(char *fileName, GBool rawOrderA, GBool append) {
  text = NULL;
  rawOrder = rawOrderA;
  ok = gTrue;

  // give the output members defined values on every path, including
  // the early return below
  needClose = gFalse;
  outputStream = NULL;
  outputFunc = NULL;

  // open file
  if (fileName) {
    if (!strcmp(fileName, "-")) {
      outputStream = stdout;
    } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
      needClose = gTrue;
    } else {
      error(-1, "Couldn't open text file '%s'", fileName);
      ok = gFalse;
      return;
    }
    outputFunc = &outputToFile;
  }

  // set up text object
  text = new TextPage(rawOrder);
}
|
|
|
// Creates a text output device that hands its output to <func>,
// passing <stream> through unchanged.  The stream is never closed by
// this object.
TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
                             GBool rawOrderA) {
  ok = gTrue;
  rawOrder = rawOrderA;

  // caller-supplied sink
  outputStream = stream;
  outputFunc = func;
  needClose = gFalse;

  text = new TextPage(rawOrder);
}
|
|
|
// Closes the output file if this object opened it, then deletes the
// text page.
TextOutputDev::~TextOutputDev() {
  if (needClose) {
    FILE *outFile = (FILE *)outputStream;
#ifdef MACOS
    ICS_MapRefNumAndAssign((short)outFile->handle);
#endif
    fclose(outFile);
  }
  // deleting a null pointer is a no-op, so no check is needed
  delete text;
}
|
|
|
void TextOutputDev::startPage(int pageNum, GfxState *state) { |
|
text->clear(); |
|
} |
|
|
|
void TextOutputDev::endPage() { |
|
text->coalesce(); |
|
if (outputStream) { |
|
text->dump(outputStream, outputFunc); |
|
} |
|
} |
|
|
|
// Forwards a font change to the text page.
void TextOutputDev::updateFont(GfxState *state) {
  this->text->updateFont(state);
}
|
|
|
// Begins a string on the text page at the state's current position.
// <s> is not used.
void TextOutputDev::beginString(GfxState *state, GString *s) {
  const double startX = state->getCurX();
  const double startY = state->getCurY();

  text->beginString(state, startX, startY);
}
|
|
|
// Ends the current string on the text page.  <state> is not used.
void TextOutputDev::endString(GfxState *state) {
  this->text->endString();
}
|
|
|
// Forwards one drawn character to the text page.  Only the position,
// the advance and the Unicode mapping are passed on; <originX>,
// <originY> and <c> are not used.
void TextOutputDev::drawChar(GfxState *state, double x, double y,
                             double dx, double dy,
                             double originX, double originY,
                             CharCode c, Unicode *u, int uLen) {
  this->text->addChar(state, x, y, dx, dy, u, uLen);
}
|
|
|
// Searches the current page for <s>; all arguments and the result are
// those of TextPage::findText(), to which the call is forwarded.
GBool TextOutputDev::findText(Unicode *s, int len,
                              GBool top, GBool bottom,
                              double *xMin, double *yMin,
                              double *xMax, double *yMax) {
  const GBool found = text->findText(s, len, top, bottom,
                                     xMin, yMin, xMax, yMax);
  return found;
}
|
|
|
// Returns the text inside the given rectangle of the current page;
// the call is forwarded to TextPage::getText().
GString *TextOutputDev::getText(double xMin, double yMin,
                                double xMax, double yMax) {
  GString *extracted = text->getText(xMin, yMin, xMax, yMax);
  return extracted;
}
|
|
|
|