addons/document/document_storage.py

   1 # -*- encoding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Management Solution
   5 #
   6 #    Copyright (C) P. Christeas, 2009, all rights reserved
   7 #
   8 #    This program is free software: you can redistribute it and/or modify
   9 #    it under the terms of the GNU General Public License as published by
  10 #    the Free Software Foundation, either version 3 of the License, or
  11 #    (at your option) any later version.
  12 #
  13 #    This program is distributed in the hope that it will be useful,
  14 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 #    GNU General Public License for more details.
  17 #
  18 #    You should have received a copy of the GNU General Public License
  19 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  20 #
  21 ##############################################################################
  22
  23 from osv import osv, fields
  24 import os
  25 import tools
  26 import base64
  27 import errno
  28 import logging
  29 from StringIO import StringIO
  30
  31 from tools.misc import ustr
  32 from tools.translate import _
  33
  34 from osv.orm import except_orm
  35
  36 import random
  37 import string
  38 import netsvc
  39 import nodes
  40 from content_index import cntIndex
  41
# Root directory under which document storage keeps its files: the
# 'document_path' configuration value, falling back to '<root_path>/filestore'.
# Note that the fallback expression is evaluated eagerly, i.e. 'root_path' is
# read from the configuration even when 'document_path' is set.
DMS_ROOT_PATH = tools.config.get('document_path', os.path.join(tools.config.get('root_path'), 'filestore'))
  43
  44
  45 """ The algorithm of data storage
  46
  47 We have to consider 3 cases of data /retrieval/:
  48  Given (context,path) we need to access the file (aka. node).
  49  given (directory, context), we need one of its children (for listings, views)
 given (ir.attachment, context), we need its data and metadata (node).
  51
  52 For data /storage/ we have the cases:
  53  Have (ir.attachment, context), we modify the file (save, update, rename etc).
  54  Have (directory, context), we create a file.
  55  Have (path, context), we create or modify a file.
  56
  57 Note that in all above cases, we don't explicitly choose the storage media,
  58 but always require a context to be present.
  59
  60 Note that a node will not always have a corresponding ir.attachment. Dynamic
nodes, for one, won't. Their metadata will be computed by the parent storage
  62 media + directory.
  63
  64 The algorithm says that in any of the above cases, our first goal is to locate
  65 the node for any combination of search criteria. It would be wise NOT to
  66 represent each node in the path (like node[/] + node[/dir1] + node[/dir1/dir2])
  67 but directly jump to the end node (like node[/dir1/dir2]) whenever possible.
  68
  69 We also contain all the parenting loop code in one function. This is intentional,
  70 because one day this will be optimized in the db (Pg 8.4).
  71
  72
  73 """
  74
  75 def random_name():
  76     random.seed()
  77     d = [random.choice(string.ascii_letters) for x in xrange(10) ]
  78     name = "".join(d)
  79     return name
  80
  81 INVALID_CHARS = {'*':str(hash('*')), '|':str(hash('|')) , "\\":str(hash("\\")), '/':'__', ':':str(hash(':')), '"':str(hash('"')), '<':str(hash('<')) , '>':str(hash('>')) , '?':str(hash('?'))}
  82
  83
  84 def create_directory(path):
  85     dir_name = random_name()
  86     path = os.path.join(path, dir_name)
  87     os.makedirs(path)
  88     return dir_name
  89
  90 class nodefd_file(nodes.node_descriptor):
  91     """ A descriptor to a real file
  92
  93     Inheriting directly from file doesn't work, since file exports
  94     some read-only attributes (like 'name') that we don't like.
  95     """
  96     def __init__(self, parent, path, mode):
  97         nodes.node_descriptor.__init__(self, parent)
  98         self.__file = open(path, mode)
  99
 100         for attr in ('closed', 'read', 'write', 'seek', 'tell'):
 101             setattr(self,attr, getattr(self.__file, attr))
 102
 103     def close(self):
 104         # TODO: locking in init, close()
 105         self.__file.close()
 106
 107
 108 class nodefd_db(StringIO, nodes.node_descriptor):
 109     """ A descriptor to db data
 110     """
 111     def __init__(self, parent, ira_browse, mode):
 112         nodes.node_descriptor.__init__(self, parent)
 113         if mode.endswith('b'):
 114             mode = mode[:-1]
 115
 116         if mode == 'r':
 117             StringIO.__init__(self, ira_browse.db_datas)
 118         elif mode == 'w':
 119             StringIO.__init__(self, ira_browse.db_datas)
 120             # at write, we start at 0 (= overwrite), but have the original
 121             # data available, in case of a seek()
 122         elif mode == 'a':
 123             StringIO.__init__(self, None)
 124         else:
 125             logging.getLogger('document.storage').error("Incorrect mode %s specified", mode)
 126             raise IOError(errno.EINVAL, "Invalid file mode")
 127         self.mode = mode
 128
 129     def close(self):
 130         # we now open a *separate* cursor, to update the data.
 131         # FIXME: this may be improved, for concurrency handling
 132         uid = self.__parent.context.uid
 133         cr = pooler.get_db(self.__parent.context.dbname).cursor()
 134         if mode == 'w':
 135             out = self.getvalue()
 136             cr.execute('UPDATE ir_attachment SET db_datas = %s, file_size=%d WHERE id = %s',
 137                 (out, len(out), self.__parent.file_id))
 138         elif mode == 'a':
 139             out = self.getvalue()
 140             cr.execute("UPDATE ir_attachment " \
 141                 "SET db_datas = COALESCE(db_datas,'') || %s, " \
 142                 "    file_size = COALESCE(file_size, 0) + %d " \
 143                 " WHERE id = %s",
 144                 (out, len(out), self.__parent.file_id))
 145         cr.commit()
 146         cr.close()
 147         StringIO.close(self)
 148
class document_storage(osv.osv):
    """ The primary object for data storage.
    Each instance of this object is a storage media, in which our application
    can store contents. The object here controls the behaviour of the storage
    media.
    The referring document.directory-ies will control the placement of data
    into the storage.

    It is a bad idea to have multiple document.storage objects pointing to
    the same tree of filesystem storage.
    """
    _name = 'document.storage'
    _description = 'Storage Media'
    # Logger shared by all methods of this model.
    _doclog = logging.getLogger('document')

    _columns = {
        'name': fields.char('Name', size=64, required=True, select=1),
        'write_date': fields.datetime('Date Modified', readonly=True),
        'write_uid':  fields.many2one('res.users', 'Last Modification User', readonly=True),
        'create_date': fields.datetime('Date Created', readonly=True),
        'create_uid':  fields.many2one('res.users', 'Creator', readonly=True),
        'user_id': fields.many2one('res.users', 'Owner'),
        'group_ids': fields.many2many('res.groups', 'document_storage_group_rel', 'item_id', 'group_id', 'Groups'),
        # NOTE(review): the inverse field is 'parent_id'; confirm that this is
        # the document.directory column pointing back to its storage.
        'dir_ids': fields.one2many('document.directory', 'parent_id', 'Directories'),
        # The methods of this class implement 'filestore', 'db' and
        # 'realstore'; any other type makes them raise TypeError.
        'type': fields.selection([('db', 'Database'), ('filestore', 'Internal File storage'),
            ('realstore', 'External file storage'), ('virtual', 'Virtual storage')], 'Type', required=True),
        'path': fields.char('Path', size=250, select=1, help="For file storage, the root path of the storage"),
        'online': fields.boolean('Online', help="If not checked, media is currently offline and its contents not available", required=True),
        'readonly': fields.boolean('Read Only', help="If set, media is for reading only"),
    }

    def _get_rootpath(self, cr, uid, context=None):
        """ Default for 'path': DMS_ROOT_PATH joined with the database name
            of the cursor.
        """
        return os.path.join(DMS_ROOT_PATH, cr.dbname)

    _defaults = {
        'user_id': lambda self, cr, uid, ctx: uid,
        'online': lambda *args: True,
        'readonly': lambda *args: False,
        # Note: the defaults below should only be used ONCE for the default
        # storage media. All other times, we should create different paths at least.
        'type': lambda *args: 'filestore',
        'path': _get_rootpath,
    }
    _sql_constraints = [
        # SQL note: a path = NULL doesn't have to be unique.
        ('path_uniq', 'UNIQUE(type,path)', "The storage path must be unique!")
        ]
 196
 197     def get_data(self, cr, uid, id, file_node, context=None, fil_obj=None):
 198         """ retrieve the contents of some file_node having storage_id = id
 199             optionally, fil_obj could point to the browse object of the file
 200             (ir.attachment)
 201         """
 202         if not context:
 203             context = {}
 204         boo = self.browse(cr, uid, id, context)
 205         if fil_obj:
 206             ira = fil_obj
 207         else:
 208             ira = self.pool.get('ir.attachment').browse(cr, uid, file_node.file_id, context=context)
 209         return self.__get_data_3(cr, uid, boo, ira, context)
 210
 211     def get_file(self, cr, uid, id, file_node, mode, context=None):
 212         if context is None:
 213             context = {}
 214         boo = self.browse(cr, uid, id, context)
 215         if not boo.online:
 216             raise RuntimeError('media offline')
 217
 218         ira = self.pool.get('ir.attachment').browse(cr, uid, file_node.file_id, context=context)
 219         if boo.type == 'filestore':
 220             if not ira.store_fname:
 221                 # On a migrated db, some files may have the wrong storage type
 222                 # try to fix their directory.
 223                 if ira.file_size:
 224                     self._doclog.warning( "ir.attachment #%d does not have a filename, but is at filestore, fix it!" % ira.id)
 225                 raise IOError(errno.ENOENT, 'No file can be located')
 226             fpath = os.path.join(boo.path, ira.store_fname)
 227             if self._debug:
 228                 self._doclog.debug("Trying to read \"%s\".."% fpath)
 229             return nodefd_file(file_node, path=fpath, mode=mode)
 230
 231         elif boo.type == 'db':
 232             # TODO: we need a better api for large files
 233             if self._debug:
 234                 self._doclog.debug("Trying to obtain db_datas for ir.attachment[%d]", ira.id)
 235             return nodefd_db(file_node, ira_browse=ira, mode=mode)
 236
 237         elif boo.type == 'realstore':
 238             if not ira.store_fname:
 239                 # On a migrated db, some files may have the wrong storage type
 240                 # try to fix their directory.
 241                 if ira.file_size:
 242                     self._doclog.warning("ir.attachment #%d does not have a filename, trying the name." %ira.id)
 243                 sfname = ira.name
 244             fpath = os.path.join(boo.path,ira.store_fname or ira.name)
 245             if not os.path.exists(fpath):
 246                 raise IOError("File not found: %s" % fpath)
 247             return nodefd_file(file_node, path=fpath, mode=mode)
 248
 249         else:
 250             raise TypeError("No %s storage" % boo.type)
 251
 252     def __get_data_3(self, cr, uid, boo, ira, context):
 253         if not boo.online:
 254             raise RuntimeError('media offline')
 255         if boo.type == 'filestore':
 256             if not ira.store_fname:
 257                 # On a migrated db, some files may have the wrong storage type
 258                 # try to fix their directory.
 259                 if ira.file_size:
 260                     self._doclog.warning( "ir.attachment #%d does not have a filename, but is at filestore, fix it!" % ira.id)
 261                 return None
 262             fpath = os.path.join(boo.path, ira.store_fname)
 263             return file(fpath, 'rb').read()
 264         elif boo.type == 'db':
 265             # TODO: we need a better api for large files
 266             if ira.db_datas:
 267                 out = base64.decodestring(ira.db_datas)
 268             else:
 269                 out = ''
 270             return out
 271         elif boo.type == 'realstore':
 272             if not ira.store_fname:
 273                 # On a migrated db, some files may have the wrong storage type
 274                 # try to fix their directory.
 275                 if ira.file_size:
 276                     self._doclog.warning("ir.attachment #%d does not have a filename, trying the name." %ira.id)
 277                 sfname = ira.name
 278             fpath = os.path.join(boo.path,ira.store_fname or ira.name)
 279             if os.path.exists(fpath):
 280                 return file(fpath,'rb').read()
 281             elif not ira.store_fname:
 282                 return None
 283             else:
 284                 raise IOError("File not found: %s" % fpath)
 285         else:
 286             raise TypeError("No %s storage" % boo.type)
 287
 288     def set_data(self, cr, uid, id, file_node, data, context=None, fil_obj=None):
 289         """ store the data.
 290             This function MUST be used from an ir.attachment. It wouldn't make sense
 291             to store things persistently for other types (dynamic).
 292         """
 293         if not context:
 294             context = {}
 295         boo = self.browse(cr, uid, id, context)
 296         if fil_obj:
 297             ira = fil_obj
 298         else:
 299             ira = self.pool.get('ir.attachment').browse(cr, uid, file_node.file_id, context=context)
 300
 301         if not boo.online:
 302             raise RuntimeError('media offline')
 303         self._doclog.debug( "Store data for ir.attachment #%d" % ira.id)
 304         store_fname = None
 305         fname = None
 306         if boo.type == 'filestore':
 307             path = boo.path
 308             try:
 309                 flag = None
 310                 # This can be improved
 311                 if os.path.isdir(path):
 312                     for dirs in os.listdir(path):
 313                         if os.path.isdir(os.path.join(path, dirs)) and len(os.listdir(os.path.join(path, dirs))) < 4000:
 314                             flag = dirs
 315                             break
 316                 flag = flag or create_directory(path)
 317                 filename = random_name()
 318                 fname = os.path.join(path, flag, filename)
 319                 fp = file(fname, 'wb')
 320                 fp.write(data)
 321                 fp.close()
 322                 self._doclog.debug( "Saved data to %s" % fname)
 323                 filesize = len(data) # os.stat(fname).st_size
 324                 store_fname = os.path.join(flag, filename)
 325
 326                 # TODO Here, an old file would be left hanging.
 327
 328             except Exception, e:
 329                 self._doclog.warning( "Couldn't save data to %s", path, exc_info=True)
 330                 raise except_orm(_('Error!'), str(e))
 331         elif boo.type == 'db':
 332             filesize = len(data)
 333             # will that work for huge data? TODO
 334             out = base64.encodestring(data)
 335             cr.execute('UPDATE ir_attachment SET db_datas = %s WHERE id = %s',
 336                 (out, file_node.file_id))
 337         elif boo.type == 'realstore':
 338             try:
 339                 file_node.fix_ppath(cr, ira)
 340                 npath = file_node.full_path() or []
 341                 # npath may contain empty elements, for root directory etc.
 342                 for i, n in enumerate(npath):
 343                     if n == None:
 344                         del npath[i]
 345                 for n in npath:
 346                     for ch in ('*', '|', "\\", '/', ':', '"', '<', '>', '?', '..'):
 347                         if ch in n:
 348                             raise ValueError("Invalid char %s in path %s" %(ch, n))
 349                 dpath = [boo.path,]
 350                 dpath += npath[:-1]
 351                 path = os.path.join(*dpath)
 352                 if not os.path.isdir(path):
 353                     os.makedirs(path)
 354                 fname = os.path.join(path, npath[-1])
 355                 fp = file(fname,'wb')
 356                 fp.write(data)
 357                 fp.close()
 358                 self._doclog.debug("Saved data to %s", fname)
 359                 filesize = len(data) # os.stat(fname).st_size
 360                 store_fname = os.path.join(*npath)
 361                 # TODO Here, an old file would be left hanging.
 362             except Exception,e :
 363                 self._doclog.warning("Couldn't save data:", exc_info=True)
 364                 raise except_orm(_('Error!'), str(e))
 365         else:
 366             raise TypeError("No %s storage" % boo.type)
 367
 368         # 2nd phase: store the metadata
 369         try:
 370             icont = ''
 371             mime = ira.file_type
 372             if not mime:
 373                 mime = ""
 374             try:
 375                 mime, icont = cntIndex.doIndex(data, ira.datas_fname,
 376                 ira.file_type or None, fname)
 377             except Exception:
 378                 self._doclog.debug('Cannot index file:', exc_info=True)
 379                 pass
 380
 381             try:
 382                 icont_u = ustr(icont)
 383             except UnicodeError:
 384                 icont_u = ''
 385
 386             # a hack: /assume/ that the calling write operation will not try
 387             # to write the fname and size, and update them in the db concurrently.
 388             # We cannot use a write() here, because we are already in one.
 389             cr.execute('UPDATE ir_attachment SET store_fname = %s, file_size = %s, index_content = %s, file_type = %s WHERE id = %s',
 390                 (store_fname, filesize, icont_u, mime, file_node.file_id))
 391             file_node.content_length = filesize
 392             file_node.content_type = mime
 393             return True
 394         except Exception, e :
 395             self._doclog.warning("Couldn't save data:", exc_info=True)
 396             # should we really rollback once we have written the actual data?
 397             # at the db case (only), that rollback would be safe
 398             raise except_orm(_('Error at doc write!'), str(e))
 399
 400     def prepare_unlink(self, cr, uid, storage_bo, fil_bo):
 401         """ Before we unlink a file (fil_boo), prepare the list of real
 402         files that have to be removed, too. """
 403
 404         if not storage_bo.online:
 405             raise RuntimeError('media offline')
 406
 407         if storage_bo.type == 'filestore':
 408             fname = fil_bo.store_fname
 409             if not fname:
 410                 return None
 411             path = storage_bo.path
 412             return (storage_bo.id, 'file', os.path.join(path, fname))
 413         elif storage_bo.type == 'db':
 414             return None
 415         elif storage_bo.type == 'realstore':
 416             fname = fil_bo.store_fname
 417             if not fname:
 418                 return None
 419             path = storage_bo.path
 420             return ( storage_bo.id, 'file', os.path.join(path, fname))
 421         else:
 422             raise TypeError("No %s storage" % boo.type)
 423
 424     def do_unlink(self, cr, uid, unres):
 425         for id, ktype, fname in unres:
 426             if ktype == 'file':
 427                 try:
 428                     os.unlink(fname)
 429                 except Exception, e:
 430                     self._doclog.warning("Could not remove file %s, please remove manually.", fname, exc_info=True)
 431             else:
 432                 self._doclog.warning("Unknown unlink key %s" % ktype)
 433
 434         return True
 435
 436
# Instantiate the model once at import time; the result is not kept.
# NOTE(review): presumably this is what registers the model with the object
# pool -- confirm against osv.osv.
document_storage()
 438
 439
 440 #eof