addons/document/document_storage.py

   1 # -*- encoding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Management Solution
   5 #
   6 #    Copyright (C) P. Christeas, 2009, all rights reserved
   7 #
   8 #    This program is free software: you can redistribute it and/or modify
   9 #    it under the terms of the GNU General Public License as published by
  10 #    the Free Software Foundation, either version 3 of the License, or
  11 #    (at your option) any later version.
  12 #
  13 #    This program is distributed in the hope that it will be useful,
  14 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 #    GNU General Public License for more details.
  17 #
  18 #    You should have received a copy of the GNU General Public License
  19 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  20 #
  21 ##############################################################################
  22
  23 from osv import osv, fields
  24 import os
  25 import tools
  26 import base64
  27 import errno
  28 import logging
  29 from StringIO import StringIO
  30
  31 from tools.misc import ustr
  32 from tools.translate import _
  33
  34 from osv.orm import except_orm
  35
  36 import random
  37 import string
  38 import pooler
  39 import netsvc
  40 import nodes
  41 from content_index import cntIndex
  42
  43 DMS_ROOT_PATH = tools.config.get('document_path', os.path.join(tools.config.get('root_path'), 'filestore'))
  44
  45
  46 """ The algorithm of data storage
  47
We have to consider 3 cases of data /retrieval/:
 Given (context, path), we need to access the file (aka. node).
 Given (directory, context), we need one of its children (for listings, views).
 Given (ir.attachment, context), we need its data and metadata (node).
  52
  53 For data /storage/ we have the cases:
  54  Have (ir.attachment, context), we modify the file (save, update, rename etc).
  55  Have (directory, context), we create a file.
  56  Have (path, context), we create or modify a file.
  57
  58 Note that in all above cases, we don't explicitly choose the storage media,
  59 but always require a context to be present.
  60
Note that a node will not always have a corresponding ir.attachment. Dynamic
nodes, for one, won't. Their metadata will be computed by the parent storage
media + directory.
  64
  65 The algorithm says that in any of the above cases, our first goal is to locate
  66 the node for any combination of search criteria. It would be wise NOT to
  67 represent each node in the path (like node[/] + node[/dir1] + node[/dir1/dir2])
  68 but directly jump to the end node (like node[/dir1/dir2]) whenever possible.
  69
  70 We also contain all the parenting loop code in one function. This is intentional,
  71 because one day this will be optimized in the db (Pg 8.4).
  72
  73
  74 """
  75
  76 def random_name():
  77     random.seed()
  78     d = [random.choice(string.ascii_letters) for x in xrange(10) ]
  79     name = "".join(d)
  80     return name
  81
  82 INVALID_CHARS = {'*':str(hash('*')), '|':str(hash('|')) , "\\":str(hash("\\")), '/':'__', ':':str(hash(':')), '"':str(hash('"')), '<':str(hash('<')) , '>':str(hash('>')) , '?':str(hash('?'))}
  83
  84
  85 def create_directory(path):
  86     dir_name = random_name()
  87     path = os.path.join(path, dir_name)
  88     os.makedirs(path)
  89     return dir_name
  90
  91 class nodefd_file(nodes.node_descriptor):
  92     """ A descriptor to a real file
  93
  94     Inheriting directly from file doesn't work, since file exports
  95     some read-only attributes (like 'name') that we don't like.
  96     """
  97     def __init__(self, parent, path, mode):
  98         nodes.node_descriptor.__init__(self, parent)
  99         self.__file = open(path, mode)
 100
 101         for attr in ('closed', 'read', 'write', 'seek', 'tell'):
 102             setattr(self,attr, getattr(self.__file, attr))
 103
 104     def close(self):
 105         # TODO: locking in init, close()
 106         self.__file.close()
 107
 108
 109 class nodefd_db(StringIO, nodes.node_descriptor):
 110     """ A descriptor to db data
 111     """
 112     def __init__(self, parent, ira_browse, mode):
 113         nodes.node_descriptor.__init__(self, parent)
 114         if mode.endswith('b'):
 115             mode = mode[:-1]
 116
 117         if mode == 'r':
 118             StringIO.__init__(self, ira_browse.db_datas)
 119         elif mode == 'w':
 120             StringIO.__init__(self, ira_browse.db_datas)
 121             # at write, we start at 0 (= overwrite), but have the original
 122             # data available, in case of a seek()
 123         elif mode == 'a':
 124             StringIO.__init__(self, None)
 125         else:
 126             logging.getLogger('document.storage').error("Incorrect mode %s specified", mode)
 127             raise IOError(errno.EINVAL, "Invalid file mode")
 128         self.mode = mode
 129
 130     def close(self):
 131         # we now open a *separate* cursor, to update the data.
 132         # FIXME: this may be improved, for concurrency handling
 133         par = self._get_parent()
 134         uid = par.context.uid
 135         cr = pooler.get_db(par.context.dbname).cursor()
 136         try:
 137             if self.mode == 'w':
 138                 out = self.getvalue()
 139                 cr.execute('UPDATE ir_attachment SET db_datas = %s, file_size=%d WHERE id = %s',
 140                     (out, len(out), par.file_id))
 141             elif self.mode == 'a':
 142                 out = self.getvalue()
 143                 cr.execute("UPDATE ir_attachment " \
 144                     "SET db_datas = COALESCE(db_datas,'') || %s, " \
 145                     "    file_size = COALESCE(file_size, 0) + %d " \
 146                     " WHERE id = %s",
 147                     (out, len(out), par.file_id))
 148             cr.commit()
 149         except Exception, e:
 150             logging.getLogger('document.storage').exception('Cannot update db file #%d for close:', par.file_id)
 151             raise
 152         finally:
 153             cr.close()
 154         StringIO.close(self)
 155
 156 class document_storage(osv.osv):
 157     """ The primary object for data storage.
 158     Each instance of this object is a storage media, in which our application
 159     can store contents. The object here controls the behaviour of the storage
 160     media.
 161     The referring document.directory-ies will control the placement of data
 162     into the storage.
 163
 164     It is a bad idea to have multiple document.storage objects pointing to
 165     the same tree of filesystem storage.
 166     """
 167     _name = 'document.storage'
 168     _description = 'Storage Media'
 169     _doclog = logging.getLogger('document')
 170
 171     _columns = {
 172         'name': fields.char('Name', size=64, required=True, select=1),
 173         'write_date': fields.datetime('Date Modified', readonly=True),
 174         'write_uid':  fields.many2one('res.users', 'Last Modification User', readonly=True),
 175         'create_date': fields.datetime('Date Created', readonly=True),
 176         'create_uid':  fields.many2one('res.users', 'Creator', readonly=True),
 177         'user_id': fields.many2one('res.users', 'Owner'),
 178         'group_ids': fields.many2many('res.groups', 'document_storage_group_rel', 'item_id', 'group_id', 'Groups'),
 179         'dir_ids': fields.one2many('document.directory', 'parent_id', 'Directories'),
 180         'type': fields.selection([('db', 'Database'), ('filestore', 'Internal File storage'),
 181             ('realstore', 'External file storage'), ('virtual', 'Virtual storage')], 'Type', required=True),
 182         'path': fields.char('Path', size=250, select=1, help="For file storage, the root path of the storage"),
 183         'online': fields.boolean('Online', help="If not checked, media is currently offline and its contents not available", required=True),
 184         'readonly': fields.boolean('Read Only', help="If set, media is for reading only"),
 185     }
 186
 187     def _get_rootpath(self, cr, uid, context=None):
 188         return os.path.join(DMS_ROOT_PATH, cr.dbname)
 189
 190     _defaults = {
 191         'user_id': lambda self, cr, uid, ctx: uid,
 192         'online': lambda *args: True,
 193         'readonly': lambda *args: False,
 194         # Note: the defaults below should only be used ONCE for the default
 195         # storage media. All other times, we should create different paths at least.
 196         'type': lambda *args: 'filestore',
 197         'path': _get_rootpath,
 198     }
 199     _sql_constraints = [
 200         # SQL note: a path = NULL doesn't have to be unique.
 201         ('path_uniq', 'UNIQUE(type,path)', "The storage path must be unique!")
 202         ]
 203
 204     def get_data(self, cr, uid, id, file_node, context=None, fil_obj=None):
 205         """ retrieve the contents of some file_node having storage_id = id
 206             optionally, fil_obj could point to the browse object of the file
 207             (ir.attachment)
 208         """
 209         if not context:
 210             context = {}
 211         boo = self.browse(cr, uid, id, context)
 212         if fil_obj:
 213             ira = fil_obj
 214         else:
 215             ira = self.pool.get('ir.attachment').browse(cr, uid, file_node.file_id, context=context)
 216         return self.__get_data_3(cr, uid, boo, ira, context)
 217
 218     def get_file(self, cr, uid, id, file_node, mode, context=None):
 219         if context is None:
 220             context = {}
 221         boo = self.browse(cr, uid, id, context)
 222         if not boo.online:
 223             raise RuntimeError('media offline')
 224
 225         ira = self.pool.get('ir.attachment').browse(cr, uid, file_node.file_id, context=context)
 226         if boo.type == 'filestore':
 227             if not ira.store_fname:
 228                 # On a migrated db, some files may have the wrong storage type
 229                 # try to fix their directory.
 230                 if ira.file_size:
 231                     self._doclog.warning( "ir.attachment #%d does not have a filename, but is at filestore, fix it!" % ira.id)
 232                 raise IOError(errno.ENOENT, 'No file can be located')
 233             fpath = os.path.join(boo.path, ira.store_fname)
 234             if self._debug:
 235                 self._doclog.debug("Trying to read \"%s\".."% fpath)
 236             return nodefd_file(file_node, path=fpath, mode=mode)
 237
 238         elif boo.type == 'db':
 239             # TODO: we need a better api for large files
 240             if self._debug:
 241                 self._doclog.debug("Trying to obtain db_datas for ir.attachment[%d]", ira.id)
 242             return nodefd_db(file_node, ira_browse=ira, mode=mode)
 243
 244         elif boo.type == 'realstore':
 245             if not ira.store_fname:
 246                 # On a migrated db, some files may have the wrong storage type
 247                 # try to fix their directory.
 248                 if ira.file_size:
 249                     self._doclog.warning("ir.attachment #%d does not have a filename, trying the name." %ira.id)
 250                 sfname = ira.name
 251             fpath = os.path.join(boo.path,ira.store_fname or ira.name)
 252             if not os.path.exists(fpath):
 253                 raise IOError("File not found: %s" % fpath)
 254             return nodefd_file(file_node, path=fpath, mode=mode)
 255
 256         else:
 257             raise TypeError("No %s storage" % boo.type)
 258
 259     def __get_data_3(self, cr, uid, boo, ira, context):
 260         if not boo.online:
 261             raise RuntimeError('media offline')
 262         if boo.type == 'filestore':
 263             if not ira.store_fname:
 264                 # On a migrated db, some files may have the wrong storage type
 265                 # try to fix their directory.
 266                 if ira.file_size:
 267                     self._doclog.warning( "ir.attachment #%d does not have a filename, but is at filestore, fix it!" % ira.id)
 268                 return None
 269             fpath = os.path.join(boo.path, ira.store_fname)
 270             return file(fpath, 'rb').read()
 271         elif boo.type == 'db':
 272             # TODO: we need a better api for large files
 273             if ira.db_datas:
 274                 out = base64.decodestring(ira.db_datas)
 275             else:
 276                 out = ''
 277             return out
 278         elif boo.type == 'realstore':
 279             if not ira.store_fname:
 280                 # On a migrated db, some files may have the wrong storage type
 281                 # try to fix their directory.
 282                 if ira.file_size:
 283                     self._doclog.warning("ir.attachment #%d does not have a filename, trying the name." %ira.id)
 284                 sfname = ira.name
 285             fpath = os.path.join(boo.path,ira.store_fname or ira.name)
 286             if os.path.exists(fpath):
 287                 return file(fpath,'rb').read()
 288             elif not ira.store_fname:
 289                 return None
 290             else:
 291                 raise IOError("File not found: %s" % fpath)
 292         else:
 293             raise TypeError("No %s storage" % boo.type)
 294
 295     def set_data(self, cr, uid, id, file_node, data, context=None, fil_obj=None):
 296         """ store the data.
 297             This function MUST be used from an ir.attachment. It wouldn't make sense
 298             to store things persistently for other types (dynamic).
 299         """
 300         if not context:
 301             context = {}
 302         boo = self.browse(cr, uid, id, context)
 303         if fil_obj:
 304             ira = fil_obj
 305         else:
 306             ira = self.pool.get('ir.attachment').browse(cr, uid, file_node.file_id, context=context)
 307
 308         if not boo.online:
 309             raise RuntimeError('media offline')
 310         self._doclog.debug( "Store data for ir.attachment #%d" % ira.id)
 311         store_fname = None
 312         fname = None
 313         if boo.type == 'filestore':
 314             path = boo.path
 315             try:
 316                 flag = None
 317                 # This can be improved
 318                 if os.path.isdir(path):
 319                     for dirs in os.listdir(path):
 320                         if os.path.isdir(os.path.join(path, dirs)) and len(os.listdir(os.path.join(path, dirs))) < 4000:
 321                             flag = dirs
 322                             break
 323                 flag = flag or create_directory(path)
 324                 filename = random_name()
 325                 fname = os.path.join(path, flag, filename)
 326                 fp = file(fname, 'wb')
 327                 fp.write(data)
 328                 fp.close()
 329                 self._doclog.debug( "Saved data to %s" % fname)
 330                 filesize = len(data) # os.stat(fname).st_size
 331                 store_fname = os.path.join(flag, filename)
 332
 333                 # TODO Here, an old file would be left hanging.
 334
 335             except Exception, e:
 336                 self._doclog.warning( "Couldn't save data to %s", path, exc_info=True)
 337                 raise except_orm(_('Error!'), str(e))
 338         elif boo.type == 'db':
 339             filesize = len(data)
 340             # will that work for huge data? TODO
 341             out = base64.encodestring(data)
 342             cr.execute('UPDATE ir_attachment SET db_datas = %s WHERE id = %s',
 343                 (out, file_node.file_id))
 344         elif boo.type == 'realstore':
 345             try:
 346                 file_node.fix_ppath(cr, ira)
 347                 npath = file_node.full_path() or []
 348                 # npath may contain empty elements, for root directory etc.
 349                 for i, n in enumerate(npath):
 350                     if n == None:
 351                         del npath[i]
 352                 for n in npath:
 353                     for ch in ('*', '|', "\\", '/', ':', '"', '<', '>', '?', '..'):
 354                         if ch in n:
 355                             raise ValueError("Invalid char %s in path %s" %(ch, n))
 356                 dpath = [boo.path,]
 357                 dpath += npath[:-1]
 358                 path = os.path.join(*dpath)
 359                 if not os.path.isdir(path):
 360                     os.makedirs(path)
 361                 fname = os.path.join(path, npath[-1])
 362                 fp = file(fname,'wb')
 363                 fp.write(data)
 364                 fp.close()
 365                 self._doclog.debug("Saved data to %s", fname)
 366                 filesize = len(data) # os.stat(fname).st_size
 367                 store_fname = os.path.join(*npath)
 368                 # TODO Here, an old file would be left hanging.
 369             except Exception,e :
 370                 self._doclog.warning("Couldn't save data:", exc_info=True)
 371                 raise except_orm(_('Error!'), str(e))
 372         else:
 373             raise TypeError("No %s storage" % boo.type)
 374
 375         # 2nd phase: store the metadata
 376         try:
 377             icont = ''
 378             mime = ira.file_type
 379             if not mime:
 380                 mime = ""
 381             try:
 382                 mime, icont = cntIndex.doIndex(data, ira.datas_fname,
 383                 ira.file_type or None, fname)
 384             except Exception:
 385                 self._doclog.debug('Cannot index file:', exc_info=True)
 386                 pass
 387
 388             try:
 389                 icont_u = ustr(icont)
 390             except UnicodeError:
 391                 icont_u = ''
 392
 393             # a hack: /assume/ that the calling write operation will not try
 394             # to write the fname and size, and update them in the db concurrently.
 395             # We cannot use a write() here, because we are already in one.
 396             cr.execute('UPDATE ir_attachment SET store_fname = %s, file_size = %s, index_content = %s, file_type = %s WHERE id = %s',
 397                 (store_fname, filesize, icont_u, mime, file_node.file_id))
 398             file_node.content_length = filesize
 399             file_node.content_type = mime
 400             return True
 401         except Exception, e :
 402             self._doclog.warning("Couldn't save data:", exc_info=True)
 403             # should we really rollback once we have written the actual data?
 404             # at the db case (only), that rollback would be safe
 405             raise except_orm(_('Error at doc write!'), str(e))
 406
 407     def prepare_unlink(self, cr, uid, storage_bo, fil_bo):
 408         """ Before we unlink a file (fil_boo), prepare the list of real
 409         files that have to be removed, too. """
 410
 411         if not storage_bo.online:
 412             raise RuntimeError('media offline')
 413
 414         if storage_bo.type == 'filestore':
 415             fname = fil_bo.store_fname
 416             if not fname:
 417                 return None
 418             path = storage_bo.path
 419             return (storage_bo.id, 'file', os.path.join(path, fname))
 420         elif storage_bo.type == 'db':
 421             return None
 422         elif storage_bo.type == 'realstore':
 423             fname = fil_bo.store_fname
 424             if not fname:
 425                 return None
 426             path = storage_bo.path
 427             return ( storage_bo.id, 'file', os.path.join(path, fname))
 428         else:
 429             raise TypeError("No %s storage" % boo.type)
 430
 431     def do_unlink(self, cr, uid, unres):
 432         for id, ktype, fname in unres:
 433             if ktype == 'file':
 434                 try:
 435                     os.unlink(fname)
 436                 except Exception, e:
 437                     self._doclog.warning("Could not remove file %s, please remove manually.", fname, exc_info=True)
 438             else:
 439                 self._doclog.warning("Unknown unlink key %s" % ktype)
 440
 441         return True
 442
 443
# Instantiate the class once, at import time.
# NOTE(review): presumably this is what registers the model with the osv
# framework -- confirm against osv.osv.
document_storage()
 445
 446
 447 #eof