mirror of
https://github.com/koreader/koreader.git
synced 2025-08-10 00:52:38 +00:00
add PDF text extraction (for pages)
This returns data the same way djvu.c already does. Hopefully, this will permit us to re-use the highlighting code (and factor it out into unireader.lua).
This commit is contained in:
120
pdf.c
120
pdf.c
@@ -311,6 +311,123 @@ static int openPage(lua_State *L) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* get the text of the given page
|
||||
*
|
||||
* will return text in a Lua table that is modeled after
|
||||
* djvu.c creates this table.
|
||||
*
|
||||
* note that the definition of "line" is somewhat arbitrary
|
||||
* here (for now)
|
||||
*
|
||||
* MuPDFs API provides text as single char information
|
||||
* that is collected in "spans". we use a span as a "line"
|
||||
* in Lua output and segment spans into words by looking
|
||||
* for space characters.
|
||||
*
|
||||
* will return an empty table if we have no text
|
||||
*/
|
||||
static int getPageText(lua_State *L) {
|
||||
fz_text_span *page_text;
|
||||
fz_text_span *ptr;
|
||||
fz_device *tdev;
|
||||
fz_bbox bbox, linebbox;
|
||||
fz_matrix ctm;
|
||||
int i;
|
||||
int word, line;
|
||||
int len, c;
|
||||
int start;
|
||||
char chars[4]; // max length of UTF-8 encoded rune
|
||||
luaL_Buffer textbuf;
|
||||
|
||||
PdfPage *page = (PdfPage*) luaL_checkudata(L, 1, "pdfpage");
|
||||
|
||||
/* returned coordinates are in centi-point (n * 0.01 pt) */
|
||||
ctm = fz_scale(100, 100);
|
||||
|
||||
page_text = fz_new_text_span(page->doc->context);
|
||||
tdev = fz_new_text_device(page->doc->context, page_text);
|
||||
fz_run_page(page->doc->xref, page->page, tdev, ctm, NULL);
|
||||
fz_free_device(tdev);
|
||||
|
||||
/* table that contains all the lines */
|
||||
lua_newtable(L);
|
||||
ptr = page_text;
|
||||
line = 1;
|
||||
while(ptr) {
|
||||
/* table for the words */
|
||||
lua_newtable(L);
|
||||
word = 1;
|
||||
linebbox = ptr->text[0].bbox; // start with sensible default
|
||||
for(i = 0; i < ptr->len; ) {
|
||||
/* will hold information about a word: */
|
||||
lua_newtable(L);
|
||||
|
||||
luaL_buffinit(L, &textbuf);
|
||||
bbox = ptr->text[i].bbox; // start with sensible default
|
||||
for(; i < ptr->len; i++) {
|
||||
/* check for space characters */
|
||||
if(ptr->text[i].c == ' ' ||
|
||||
ptr->text[i].c == '\t' ||
|
||||
ptr->text[i].c == '\n' ||
|
||||
ptr->text[i].c == '\v' ||
|
||||
ptr->text[i].c == '\f' ||
|
||||
ptr->text[i].c == '\r' ) {
|
||||
// ignore and end word
|
||||
i++;
|
||||
break;
|
||||
}
|
||||
len = runetochar(chars, &ptr->text[i].c);
|
||||
for(c = 0; c < len; c++) {
|
||||
luaL_addchar(&textbuf, chars[c]);
|
||||
}
|
||||
bbox = fz_union_bbox(bbox, ptr->text[i].bbox);
|
||||
linebbox = fz_union_bbox(linebbox, ptr->text[i].bbox);
|
||||
}
|
||||
lua_pushstring(L, "word");
|
||||
luaL_pushresult(&textbuf);
|
||||
lua_settable(L, -3);
|
||||
|
||||
/* bbox for a word: */
|
||||
lua_pushstring(L, "x0");
|
||||
lua_pushinteger(L, bbox.x0);
|
||||
lua_settable(L, -3);
|
||||
lua_pushstring(L, "y0");
|
||||
lua_pushinteger(L, bbox.y0);
|
||||
lua_settable(L, -3);
|
||||
lua_pushstring(L, "x1");
|
||||
lua_pushinteger(L, bbox.x1);
|
||||
lua_settable(L, -3);
|
||||
lua_pushstring(L, "y1");
|
||||
lua_pushinteger(L, bbox.y1);
|
||||
lua_settable(L, -3);
|
||||
|
||||
lua_rawseti(L, -2, word++);
|
||||
}
|
||||
|
||||
/* bbox for a whole line (or in fact, a "span") */
|
||||
lua_pushstring(L, "x0");
|
||||
lua_pushinteger(L, linebbox.x0);
|
||||
lua_settable(L, -3);
|
||||
lua_pushstring(L, "y0");
|
||||
lua_pushinteger(L, linebbox.y0);
|
||||
lua_settable(L, -3);
|
||||
lua_pushstring(L, "x1");
|
||||
lua_pushinteger(L, linebbox.x1);
|
||||
lua_settable(L, -3);
|
||||
lua_pushstring(L, "y1");
|
||||
lua_pushinteger(L, linebbox.y1);
|
||||
lua_settable(L, -3);
|
||||
|
||||
lua_rawseti(L, -2, line++);
|
||||
|
||||
ptr = ptr->next;
|
||||
}
|
||||
|
||||
fz_free_text_span(page->doc->context, page_text);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int getPageSize(lua_State *L) {
|
||||
fz_matrix ctm;
|
||||
fz_rect bounds;
|
||||
@@ -323,7 +440,7 @@ static int getPageSize(lua_State *L) {
|
||||
ctm = fz_concat(ctm, fz_rotate(dc->rotate));
|
||||
bbox = fz_transform_rect(ctm, bounds);
|
||||
|
||||
lua_pushnumber(L, bbox.x1-bbox.x0);
|
||||
lua_pushnumber(L, bbox.x1-bbox.x0);
|
||||
lua_pushnumber(L, bbox.y1-bbox.y0);
|
||||
|
||||
return 2;
|
||||
@@ -456,6 +573,7 @@ static const struct luaL_Reg pdfdocument_meth[] = {
|
||||
static const struct luaL_Reg pdfpage_meth[] = {
|
||||
{"getSize", getPageSize},
|
||||
{"getUsedBBox", getUsedBBox},
|
||||
{"getPageText", getPageText},
|
||||
{"close", closePage},
|
||||
{"__gc", closePage},
|
||||
{"draw", drawPage},
|
||||
|
||||
Reference in New Issue
Block a user