Author: zagy
Date: Tue Sep 27 13:24:41 2005
New Revision: 3605
Modified:
glome/trunk/Extensions/Install.py
glome/trunk/transforms/pdf_to_html.py
Log:
- pdf_to_html handles images now
- auto-registering pdf_to_html and ppt transforms on install
Modified: glome/trunk/Extensions/Install.py
==============================================================================
--- glome/trunk/Extensions/Install.py (original)
+++ glome/trunk/Extensions/Install.py Tue Sep 27 13:24:41 2005
@@ -352,6 +352,22 @@
cat.manage_delIndex(["modified"])
cat.manage_addIndex("modified", "DateIndex")
cat.reindexIndex('modified', self.REQUEST)
+
+def configure_transforms(self, out):
+ pt = getToolByName(self, 'portal_transforms')
+ transforms = pt.objectIds()
+ if 'excel_to_html' not in transforms:
+ pt.manage_addTransform('excel_to_html',
+ 'Products.glome.transforms.excel_to_html')
+ if 'ppt_to_html' not in transforms:
+ pt.manage_addTransform('ppt_to_html',
+ 'Products.glome.transforms.ppt_to_html')
+
+ if 'pdf_to_html' in transforms:
+ pt.manage_delObjects(['pdf_to_html'])
+ pt.manage_addTransform('pdf_to_html',
+ 'Products.glome.transforms.pdf_to_html')
+
def install(self):
@@ -400,7 +416,11 @@
print >>out, 'Configuring catalogs'
configure_catalog(self, out)
-
+
+ print >>out, 'Adding transforms ...'
+ configure_transforms(self, out)
+
+
out.write("Successfully installed %s." % config.PROJECTNAME)
return out.getvalue()
Modified: glome/trunk/transforms/pdf_to_html.py
==============================================================================
--- glome/trunk/transforms/pdf_to_html.py (original)
+++ glome/trunk/transforms/pdf_to_html.py Tue Sep 27 13:24:41 2005
@@ -8,17 +8,12 @@
import re
import tempfile
-# CMF imports
-from Products.CMFDefault.utils import bodyfinder
-
-# Sibling imports
from Products.PortalTransforms.utils import log
from Products.PortalTransforms.interfaces import itransform
-from Products.PortalTransforms.libtransforms.utils import bin_search, sansext
+from Products.PortalTransforms.libtransforms.utils import \
+ bin_search, sansext, bodyfinder, scrubHTML
from Products.PortalTransforms.libtransforms.commandtransform import \
commandtransform
-from Products.PortalTransforms.libtransforms.commandtransform import \
- popentransform
ANCHOR_PATTERN = r'<[aA] href="%s(#[0-9]+)">'
ANCHOR_REPLACE = r'<a href="\1">'
@@ -28,7 +23,7 @@
pattern = ANCHOR_PATTERN % re.escape(filename)
return re.sub(pattern, ANCHOR_REPLACE, html)
-class pdf_to_html(popentransform):
+class pdf_to_html(commandtransform):
__implements__ = itransform
__version__ = '2004-07-02.01'
@@ -38,40 +33,44 @@
output = 'text/html'
output_encoding = 'utf-8'
- binaryName = "pdftohtml"
+ binaryame = "pdftohtml"
binaryArgs = "%(infile)s -noframes -stdout -enc UTF-8"
- useStdin = False
-
- def getData(self, couterr):
- return bodyfinder(couterr.read())
-
- def convert(self, data, cache, **kwargs):
- command = "%s %s" % (self.binary, self.binaryArgs)
- if not self.useStdin:
- tmpfile, tmpname = tempfile.mkstemp(text=False) # create tmp
- os.write(tmpfile, data) # write data to tmp using a file descriptor
- os.close(tmpfile) # close it so the other process can read it
- command = command % { 'infile' : tmpname } # apply tmp name to command
-
- log('PortalTransforms: Calling %s' % command)
- cin, couterr = os.popen4(command, mode='b')
- if self.useStdin:
- cin.write(data)
+ def __init__(self, name=None, **kwargs):
+ commandtransform.__init__(self, name, **kwargs)
+ name = self.name()
+ if not name.endswith('.pdf'):
+ name = name + ".pdf"
- status = cin.close()
+ def convert(self, data, cache, **kwargs):
+ name = self.name()
+ tmpdir, fullname = self.initialize_tmpdir(data, filename=name)
+ target_name = '%s/%s.html' % (tmpdir, name)
- out = self.getData(couterr)
- out = cleanInternalLinks(out, tmpname)
- couterr.close()
-
- if not self.useStdin:
- # remove tmp file
- os.unlink(tmpname)
+ command = 'cd "%s" && pdftohtml -noframes -enc UTF-8 %s %s' % (
+ tmpdir, fullname, target_name)
+ log('PortalTransforms: Calling %s' % command)
+ os.system(command)
+
+ html = self.html(target_name)
+ path, images = self.subObjects(tmpdir)
+ objects = {}
+ if images:
+ self.fixImages(path, images, objects)
+ self.cleanDir(tmpdir)
- cache.setData(out)
+ cache.setData(html)
+ cache.setSubObjects(objects)
return cache
+ def html(self, html_file_name):
+ htmlfile = file(html_file_name, 'r')
+ html = htmlfile.read()
+ htmlfile.close()
+ html = scrubHTML(html)
+ body = bodyfinder(html)
+ return body
+
def register():
return pdf_to_html()
|