From 80a6e0210be92a5d9c0ba5299e876463e7e718de Mon Sep 17 00:00:00 2001
From: HW
Date: Wed, 11 Apr 2012 21:12:59 +0200
Subject: [PATCH] add PDF text extraction (for pages)

this will return data in the way that djvu.c does already. hopefully,
this will permit us to re-use the highlighting code (and factor it out
into unireader.lua)
---
 pdf.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 130 insertions(+), 1 deletion(-)

diff --git a/pdf.c b/pdf.c
index 87190f14d..7b0a811bf 100644
--- a/pdf.c
+++ b/pdf.c
@@ -311,6 +311,134 @@ static int openPage(lua_State *L) {
 	return 1;
 }
 
+/* tell whether the given rune is one of the whitespace
+ * characters that separate words
+ */
+static int isWordSeparator(int c) {
+	return c == ' ' || c == '\t' || c == '\n' ||
+		c == '\v' || c == '\f' || c == '\r';
+}
+
+/* store the corners of a bbox as integer fields x0, y0, x1, y1
+ * in the table on top of the Lua stack
+ */
+static void setBBoxFields(lua_State *L, fz_bbox bbox) {
+	lua_pushinteger(L, bbox.x0);
+	lua_setfield(L, -2, "x0");
+	lua_pushinteger(L, bbox.y0);
+	lua_setfield(L, -2, "y0");
+	lua_pushinteger(L, bbox.x1);
+	lua_setfield(L, -2, "x1");
+	lua_pushinteger(L, bbox.y1);
+	lua_setfield(L, -2, "y1");
+}
+
+/* get the text of the given page
+ *
+ * will return text in a Lua table that is modeled after
+ * the table that djvu.c creates: an array of lines, each
+ * line being an array of words. every word table has the
+ * fields "word", x0, y0, x1 and y1, and every line table
+ * has x0, y0, x1 and y1 in addition to its words.
+ *
+ * note that the definition of "line" is somewhat arbitrary
+ * here (for now)
+ *
+ * MuPDFs API provides text as single char information
+ * that is collected in "spans". we use a span as a "line"
+ * in Lua output and segment spans into words by looking
+ * for space characters. runs of space characters never
+ * produce empty words.
+ *
+ * spans without any characters are skipped, so this will
+ * return an empty table if we have no text
+ */
+static int getPageText(lua_State *L) {
+	fz_text_span *page_text;
+	fz_text_span *ptr;
+	fz_device *tdev;
+	fz_bbox bbox, linebbox;
+	fz_matrix ctm;
+	int i;
+	int word, line;
+	int len, c;
+	char chars[4]; // max length of UTF-8 encoded rune
+	luaL_Buffer textbuf;
+
+	PdfPage *page = (PdfPage*) luaL_checkudata(L, 1, "pdfpage");
+
+	/* returned coordinates are in centi-point (n * 0.01 pt) */
+	ctm = fz_scale(100, 100);
+
+	page_text = fz_new_text_span(page->doc->context);
+	tdev = fz_new_text_device(page->doc->context, page_text);
+	fz_run_page(page->doc->xref, page->page, tdev, ctm, NULL);
+	fz_free_device(tdev);
+
+	/* table that contains all the lines */
+	lua_newtable(L);
+	line = 1;
+	for(ptr = page_text; ptr; ptr = ptr->next) {
+		/* a span without characters has no text[0] we could
+		 * read the default bbox from, so leave it out
+		 */
+		if(ptr->len <= 0) {
+			continue;
+		}
+
+		/* table for the words */
+		lua_newtable(L);
+		word = 1;
+		linebbox = ptr->text[0].bbox; // start with sensible default
+		for(i = 0; i < ptr->len; ) {
+			/* separators only delimit words, skip them */
+			if(isWordSeparator(ptr->text[i].c)) {
+				i++;
+				continue;
+			}
+
+			/* will hold information about a word: */
+			lua_newtable(L);
+
+			/* the key has to be pushed before the buffer is
+			 * set up: a luaL_Buffer may keep pieces on the
+			 * stack, so nothing may be pushed on top of it
+			 * until luaL_pushresult() is done
+			 */
+			lua_pushstring(L, "word");
+			luaL_buffinit(L, &textbuf);
+			bbox = ptr->text[i].bbox; // start with sensible default
+			for(; i < ptr->len; i++) {
+				if(isWordSeparator(ptr->text[i].c)) {
+					break;
+				}
+				len = runetochar(chars, &ptr->text[i].c);
+				for(c = 0; c < len; c++) {
+					luaL_addchar(&textbuf, chars[c]);
+				}
+				bbox = fz_union_bbox(bbox, ptr->text[i].bbox);
+				linebbox = fz_union_bbox(linebbox, ptr->text[i].bbox);
+			}
+			luaL_pushresult(&textbuf);
+			lua_settable(L, -3);
+
+			/* bbox for a word: */
+			setBBoxFields(L, bbox);
+
+			lua_rawseti(L, -2, word++);
+		}
+
+		/* bbox for a whole line (or in fact, a "span") */
+		setBBoxFields(L, linebbox);
+
+		lua_rawseti(L, -2, line++);
+	}
+
+	fz_free_text_span(page->doc->context, page_text);
+
+	return 1;
+}
+
 static int getPageSize(lua_State *L) {
 	fz_matrix ctm;
 	fz_rect bounds;
@@ -323,7 +451,7 @@ static int getPageSize(lua_State *L) {
 	ctm = fz_concat(ctm, fz_rotate(dc->rotate));
 	bbox = fz_transform_rect(ctm, bounds);
 
-	lua_pushnumber(L, bbox.x1-bbox.x0);
+	lua_pushnumber(L, bbox.x1-bbox.x0);
 	lua_pushnumber(L, bbox.y1-bbox.y0);
 
 	return 2;
@@ -456,6 +584,7 @@ static const struct luaL_Reg pdfdocument_meth[] = {
 static const struct luaL_Reg pdfpage_meth[] = {
 	{"getSize", getPageSize},
 	{"getUsedBBox", getUsedBBox},
+	{"getPageText", getPageText},
 	{"close", closePage},
 	{"__gc", closePage},
 	{"draw", drawPage},