#undef HAVE_STAT
#include "../devices/swf.h"
#include "../devices/render.h"
+#include "../devices/ocr.h"
#include "../devices/rescale.h"
#include "../devices/text.h"
#include "../pdf/pdf.h"
+#include "../readers/swf.h"
+#include "../readers/image.h"
#include "../log.h"
#include "../utf8.h"
-gfxsource_t*pdfdriver;
+static gfxsource_t*pdfdriver = 0;
+static gfxsource_t*swfdriver = 0;
+static gfxsource_t*imagedriver = 0;
staticforward PyTypeObject OutputClass;
staticforward PyTypeObject PageClass;
self->output_device->endpage(self->output_device);
return PY_NONE;
}
-PyDoc_STRVAR(output_setParameter_doc, \
-"setParameter(key, value)\n\n"
+PyDoc_STRVAR(output_setparameter_doc, \
+"setparameter(key, value)\n\n"
"Set a output-device dependent parameter"
);
-static PyObject* output_setParameter(PyObject* _self, PyObject* args, PyObject* kwargs)
+static PyObject* output_setparameter(PyObject* _self, PyObject* args, PyObject* kwargs)
{
OutputObject* self = (OutputObject*)_self;
static char *kwlist[] = {"key", "value", NULL};
return (PyObject*)self;
}
+PyDoc_STRVAR(f_createOCR_doc, \
+"OCR()\n\n"
+"Creates a device which processes documents using OCR (optical\n"
+"character recognition).\n"
+"This is handy for e.g. extracting fulltext from PDF documents\n"
+"which have broken fonts, and where hence the \"PlainText\"\n"
+"device doesn't work.\n"
+);
+/* Module-level factory behind gfx.OCR().
+ *
+ * Takes no arguments. On success returns a new OutputObject whose
+ * output_device is a freshly allocated gfxdevice_t initialised by
+ * gfxdevice_ocr_init(). On failure returns NULL with a Python exception
+ * set (bad arguments, or out of memory).
+ */
+static PyObject* f_createOCR(PyObject* parent, PyObject* args, PyObject* kwargs)
+{
+    static char *kwlist[] = {NULL};
+    if (args && !PyArg_ParseTupleAndKeywords(args, kwargs, "", kwlist))
+        return NULL;
+
+    /* Allocate the device before the Python object, so that a failure
+     * never leaves an OutputObject with an uninitialised output_device
+     * for the type's deallocator to trip over. */
+    gfxdevice_t*device = malloc(sizeof(gfxdevice_t));
+    if (!device)
+        return PyErr_NoMemory();
+
+    OutputObject*self = PyObject_New(OutputObject, &OutputClass);
+    if (!self) {
+        /* PyObject_New has already set the exception */
+        free(device);
+        return NULL;
+    }
+
+    self->output_device = device;
+    gfxdevice_ocr_init(self->output_device);
+    return (PyObject*)self;
+}
+
+
PyDoc_STRVAR(f_createImageList_doc, \
"ImageList()\n\n"
"Creates a device which renders documents to bitmaps.\n"
{"save", (PyCFunction)output_save, METH_KEYWORDS, output_save_doc},
{"startpage", (PyCFunction)output_startpage, METH_KEYWORDS, output_startpage_doc},
{"endpage", (PyCFunction)output_endpage, METH_KEYWORDS, output_endpage_doc},
- {"setParameter", (PyCFunction)output_setParameter, METH_KEYWORDS, output_setParameter_doc},
+ {"setparameter", (PyCFunction)output_setparameter, METH_KEYWORDS, output_setparameter_doc},
{0,0,0,0}
};
return PyString_FromString(s);
}
-PyDoc_STRVAR(doc_setParameter_doc,
-"setParameter(key, value)\n\n"
+PyDoc_STRVAR(doc_setparameter_doc,
+"setparameter(key, value)\n\n"
"Pass a parameter or setting to the document parser. Unlike\n"
"the module level setparameter() function, the parameters set\n"
-"using setParameter will only be valid for the object itself\n"
+"using setparameter will only be valid for the object itself\n"
"during its lifetime.\n"
);
-static PyObject* doc_setParameter(PyObject* _self, PyObject* args, PyObject* kwargs)
+static PyObject* doc_setparameter(PyObject* _self, PyObject* args, PyObject* kwargs)
{
DocObject* self = (DocObject*)_self;
PyDoc_STRVAR(f_open_doc,
"open(type, filename) -> object\n\n"
-"Open a PDF file. The type argument always has to be \"pdf\"\n"
-"It returns a doc object which can be used to process the pdf\n"
-"contents. E.g.\n"
+"Open a PDF, SWF or image file. The type argument should be \"pdf\",\n"
+"\"swf\" or \"image\" accordingly. It returns a doc object which can be\n"
+"used to process the file contents.\n"
+"E.g.\n"
" doc = open(\"pdf\", \"document.pdf\")\n"
-"If the file is not a PDF file or is encrypted without\n"
+" doc = open(\"swf\", \"flashfile.swf\")\n"
+" doc = open(\"image\", \"image.png\")\n"
+"If the file could not be loaded, or is a encrypted PDF file without\n"
"a proper password specified, an exception is being raised.\n"
"If the filename argument contains a '|' char, everything behind\n"
"the '|' is treated as password used for opening the file.\n"
"E.g.\n"
" doc = open(\"pdf\", \"document.pdf|mysecretpassword\")\n"
+".\n"
+"Notice that for image files, the only supported file formats right now\n"
+"are jpeg and png.\n"
);
static PyObject* f_open(PyObject* parent, PyObject* args, PyObject* kwargs)
{
static char *kwlist[] = {"type", "filename", NULL};
- char*filename;
- char*type;
+ char*filename=0;
+ char*type=0;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ss", kwlist, &type, &filename)) {
static char *kwlist2[] = {"filename", NULL};
- type = "pdf";
+ type = 0;
PyErr_Clear();
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s", kwlist2, &filename))
return NULL;
}
DocObject*self = PyObject_New(DocObject, &DocClass);
+
+ if(!type) { //autodetect
+ type = "pdf"; //default
+ int l = strlen(filename);
+ if(l>4) {
+ if(filename[l-4]=='.') {
+ if(strchr("pP", filename[l-3]) && strchr("dD", filename[l-2]) && strchr("fF", filename[l-1]))
+ type = "pdf";
+ if(strchr("jJ", filename[l-3]) && strchr("pP", filename[l-2]) && strchr("gG", filename[l-1]))
+ type = "image";
+ if(strchr("pP", filename[l-3]) && strchr("nN", filename[l-2]) && strchr("gG", filename[l-1]))
+ type = "image";
+ if(strchr("sS", filename[l-3]) && strchr("wW", filename[l-2]) && strchr("fF", filename[l-1]))
+ type = "swf";
+ } else if(filename[l-5]=='.') {
+ type = "image";
+ }
+ }
+ }
if(!strcmp(type,"pdf"))
self->doc = pdfdriver->open(pdfdriver,filename);
+    else if(!strcmp(type, "image") || !strcmp(type, "img"))
+        self->doc = imagedriver->open(imagedriver, filename);
+    else if(!strcmp(type, "swf") || !strcmp(type, "SWF"))
+        /* pass the swf driver as its own receiver, exactly as the pdf and
+         * image branches do with theirs (was: imagedriver, a copy/paste slip) */
+        self->doc = swfdriver->open(swfdriver, filename);
else
return PY_ERROR("Unknown type %s", type);
/* PDF functions */
{"getPage", (PyCFunction)doc_getPage, METH_KEYWORDS, doc_getPage_doc},
{"getInfo", (PyCFunction)doc_getInfo, METH_KEYWORDS, doc_getInfo_doc},
- {"setParameter", (PyCFunction)doc_setParameter, METH_KEYWORDS, doc_setParameter_doc},
+ {"setparameter", (PyCFunction)doc_setparameter, METH_KEYWORDS, doc_setparameter_doc},
{0,0,0,0}
};
"A Doc object is used for storing a document (like a PDF).\n"
"doc.pages contains the number of pages in the document,\n"
"and doc.filename the name of the file the document was\n"
-"created (loaded) from\n"
+"created (loaded) from. If the document was created from\n"
+"an image file, the number of pages is always 1\n"
);
static PyTypeObject DocClass =
{
{"open", (PyCFunction)f_open, METH_KEYWORDS, f_open_doc},
{"addfont", (PyCFunction)f_addfont, METH_KEYWORDS, f_addfont_doc},
{"addfontdir", (PyCFunction)f_addfontdir, METH_KEYWORDS, f_addfontdir_doc},
- {"setoption", (PyCFunction)f_setparameter, METH_KEYWORDS, f_setparameter_doc}, // for backwards-compatibility
{"setparameter", (PyCFunction)f_setparameter, METH_KEYWORDS, f_setparameter_doc},
{"verbose", (PyCFunction)f_verbose, METH_KEYWORDS, f_verbose_doc},
/* devices */
{"SWF", (PyCFunction)f_createSWF, METH_KEYWORDS, f_createSWF_doc},
+ {"OCR", (PyCFunction)f_createOCR, METH_KEYWORDS, f_createOCR_doc},
{"ImageList", (PyCFunction)f_createImageList, METH_KEYWORDS, f_createImageList_doc},
{"PlainText", (PyCFunction)f_createPlainText, METH_KEYWORDS, f_createPlainText_doc},
{"PassThrough", (PyCFunction)f_createPassThrough, METH_KEYWORDS, f_createPassThrough_doc},
"The latter functionality is similar to what is offered by swftools'\n"
"(http://www.swftools.org) pdf2swf utility, however more powerful-\n"
"You can also create individual SWF files from single pages of the PDF\n"
-"or combine more than one page into a bigger PDF.\n"
+"or mix pages from different PDF files.\n"
);
void initgfx(void)
DocClass.ob_type = &PyType_Type;
pdfdriver = gfxsource_pdf_create();
+ swfdriver = gfxsource_swf_create();
+ imagedriver = gfxsource_image_create();
PyObject*module = Py_InitModule3("gfx", pdf2swf_methods, gfx_doc);
PyObject*module_dict = PyModule_GetDict(module);