addons/document/document_storage.py

   1 # -*- encoding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Management Solution
   5 #
   6 #    Copyright (C) P. Christeas, 2009, all rights reserved
   7 #
   8 #    This program is free software: you can redistribute it and/or modify
   9 #    it under the terms of the GNU General Public License as published by
  10 #    the Free Software Foundation, either version 3 of the License, or
  11 #    (at your option) any later version.
  12 #
  13 #    This program is distributed in the hope that it will be useful,
  14 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 #    GNU General Public License for more details.
  17 #
  18 #    You should have received a copy of the GNU General Public License
  19 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  20 #
  21 ##############################################################################
  22
  23 from osv import osv, fields
  24 import os
  25 import tools
  26 import base64
  27 import errno
  28 import logging
  29 from StringIO import StringIO
  30
  31 from tools.misc import ustr
  32 from tools.translate import _
  33
  34 from osv.orm import except_orm
  35
  36 import random
  37 import string
  38 import pooler
  39 import netsvc
  40 import nodes
  41 from content_index import cntIndex
  42
  43 DMS_ROOT_PATH = tools.config.get('document_path', os.path.join(tools.config.get('root_path'), 'filestore'))
  44
  45
  46 """ The algorithm of data storage
  47
  48 We have to consider 3 cases of data /retrieval/:
  49  Given (context,path) we need to access the file (aka. node).
  50  given (directory, context), we need one of its children (for listings, views)
   51  given (ir.attachment, context), we need its data and metadata (node).
  52
  53 For data /storage/ we have the cases:
  54  Have (ir.attachment, context), we modify the file (save, update, rename etc).
  55  Have (directory, context), we create a file.
  56  Have (path, context), we create or modify a file.
  57
  58 Note that in all above cases, we don't explicitly choose the storage media,
  59 but always require a context to be present.
  60
  61 Note that a node will not always have a corresponding ir.attachment. Dynamic
   62 nodes, for one, won't. Their metadata will be computed by the parent storage
  63 media + directory.
  64
  65 The algorithm says that in any of the above cases, our first goal is to locate
  66 the node for any combination of search criteria. It would be wise NOT to
  67 represent each node in the path (like node[/] + node[/dir1] + node[/dir1/dir2])
  68 but directly jump to the end node (like node[/dir1/dir2]) whenever possible.
  69
  70 We also contain all the parenting loop code in one function. This is intentional,
  71 because one day this will be optimized in the db (Pg 8.4).
  72
  73
  74 """
  75
  76 def random_name():
  77     random.seed()
  78     d = [random.choice(string.ascii_letters) for x in xrange(10) ]
  79     name = "".join(d)
  80     return name
  81
  82 INVALID_CHARS = {'*':str(hash('*')), '|':str(hash('|')) , "\\":str(hash("\\")), '/':'__', ':':str(hash(':')), '"':str(hash('"')), '<':str(hash('<')) , '>':str(hash('>')) , '?':str(hash('?'))}
  83
  84
  85 def create_directory(path):
  86     dir_name = random_name()
  87     path = os.path.join(path, dir_name)
  88     os.makedirs(path)
  89     return dir_name
  90
  91 class nodefd_file(nodes.node_descriptor):
  92     """ A descriptor to a real file
  93
  94     Inheriting directly from file doesn't work, since file exports
  95     some read-only attributes (like 'name') that we don't like.
  96     """
  97     def __init__(self, parent, path, mode):
  98         nodes.node_descriptor.__init__(self, parent)
  99         self.__file = open(path, mode)
 100
 101         for attr in ('closed', 'read', 'write', 'seek', 'tell'):
 102             setattr(self,attr, getattr(self.__file, attr))
 103
 104     def close(self):
 105         # TODO: locking in init, close()
 106         self.__file.close()
 107
 108
 109 class nodefd_db(StringIO, nodes.node_descriptor):
 110     """ A descriptor to db data
 111     """
 112     def __init__(self, parent, ira_browse, mode):
 113         nodes.node_descriptor.__init__(self, parent)
 114         if mode.endswith('b'):
 115             mode = mode[:-1]
 116
 117         if mode in ('r', 'r+'):
 118             cr.execute('SELECT db_datas FROM ir_attachment WHERE id = %s', ira_browse.id)
 119             data = cr.fetchone()[0]
 120             StringIO.__init__(self, data)
 121         elif mode in ('w', 'w+'):
 122             StringIO.__init__(self, None)
 123             # at write, we start at 0 (= overwrite), but have the original
 124             # data available, in case of a seek()
 125         elif mode == 'a':
 126             StringIO.__init__(self, None)
 127         else:
 128             logging.getLogger('document.storage').error("Incorrect mode %s specified", mode)
 129             raise IOError(errno.EINVAL, "Invalid file mode")
 130         self.mode = mode
 131
 132     def close(self):
 133         # we now open a *separate* cursor, to update the data.
 134         # FIXME: this may be improved, for concurrency handling
 135         par = self._get_parent()
 136         uid = par.context.uid
 137         cr = pooler.get_db(par.context.dbname).cursor()
 138         try:
 139             if self.mode in ('w', 'w+', 'r+'):
 140                 out = self.getvalue()
 141                 cr.execute("UPDATE ir_attachment SET db_datas = decode(%s,'escape'), file_size=%s WHERE id = %s",
 142                     (out, len(out), par.file_id))
 143             elif self.mode == 'a':
 144                 out = self.getvalue()
 145                 cr.execute("UPDATE ir_attachment " \
 146                     "SET db_datas = COALESCE(db_datas,'') || decode(%s, 'escape'), " \
 147                     "    file_size = COALESCE(file_size, 0) + %s " \
 148                     " WHERE id = %s",
 149                     (out, len(out), par.file_id))
 150             cr.commit()
 151         except Exception, e:
 152             logging.getLogger('document.storage').exception('Cannot update db file #%d for close:', par.file_id)
 153             raise
 154         finally:
 155             cr.close()
 156         StringIO.close(self)
 157
 158 class nodefd_db64(StringIO, nodes.node_descriptor):
 159     """ A descriptor to db data, base64 (the old way)
 160
 161         It stores the data in base64 encoding at the db. Not optimal, but
 162         the transparent compression of Postgres will save the day.
 163     """
 164     def __init__(self, parent, ira_browse, mode):
 165         nodes.node_descriptor.__init__(self, parent)
 166         if mode.endswith('b'):
 167             mode = mode[:-1]
 168
 169         if mode in ('r', 'r+'):
 170             StringIO.__init__(self, base64.decodestring(ira_browse.db_datas))
 171         elif mode in ('w', 'w+'):
 172             StringIO.__init__(self, None)
 173             # at write, we start at 0 (= overwrite), but have the original
 174             # data available, in case of a seek()
 175         elif mode == 'a':
 176             StringIO.__init__(self, None)
 177         else:
 178             logging.getLogger('document.storage').error("Incorrect mode %s specified", mode)
 179             raise IOError(errno.EINVAL, "Invalid file mode")
 180         self.mode = mode
 181
 182     def close(self):
 183         # we now open a *separate* cursor, to update the data.
 184         # FIXME: this may be improved, for concurrency handling
 185         par = self._get_parent()
 186         uid = par.context.uid
 187         cr = pooler.get_db(par.context.dbname).cursor()
 188         try:
 189             if self.mode in ('w', 'w+', 'r+'):
 190                 out = self.getvalue()
 191                 cr.execute('UPDATE ir_attachment SET db_datas = %s::bytea, file_size=%s WHERE id = %s',
 192                     (base64.encodestring(out), len(out), par.file_id))
 193             elif self.mode == 'a':
 194                 out = self.getvalue()
 195                 # Yes, we're obviously using the wrong representation for storing our
 196                 # data as base64-in-bytea
 197                 cr.execute("UPDATE ir_attachment " \
 198                     "SET db_datas = encode( (COALESCE(decode(encode(db_datas,'escape'),'base64'),'') || decode(%s, 'base64')),'base64')::bytea , " \
 199                     "    file_size = COALESCE(file_size, 0) + %s " \
 200                     " WHERE id = %s",
 201                     (base64.encodestring(out), len(out), par.file_id))
 202             cr.commit()
 203         except Exception, e:
 204             logging.getLogger('document.storage').exception('Cannot update db file #%d for close:', par.file_id)
 205             raise
 206         finally:
 207             cr.close()
 208         StringIO.close(self)
 209
 210 class document_storage(osv.osv):
 211     """ The primary object for data storage.
 212     Each instance of this object is a storage media, in which our application
 213     can store contents. The object here controls the behaviour of the storage
 214     media.
 215     The referring document.directory-ies will control the placement of data
 216     into the storage.
 217
 218     It is a bad idea to have multiple document.storage objects pointing to
 219     the same tree of filesystem storage.
 220     """
 221     _name = 'document.storage'
 222     _description = 'Storage Media'
 223     _doclog = logging.getLogger('document')
 224
 225     _columns = {
 226         'name': fields.char('Name', size=64, required=True, select=1),
 227         'write_date': fields.datetime('Date Modified', readonly=True),
 228         'write_uid':  fields.many2one('res.users', 'Last Modification User', readonly=True),
 229         'create_date': fields.datetime('Date Created', readonly=True),
 230         'create_uid':  fields.many2one('res.users', 'Creator', readonly=True),
 231         'user_id': fields.many2one('res.users', 'Owner'),
 232         'group_ids': fields.many2many('res.groups', 'document_storage_group_rel', 'item_id', 'group_id', 'Groups'),
 233         'dir_ids': fields.one2many('document.directory', 'parent_id', 'Directories'),
 234         'type': fields.selection([('db', 'Database'), ('filestore', 'Internal File storage'),
 235                 ('realstore','External file storage'),], 'Type', required=True),
 236         'path': fields.char('Path', size=250, select=1, help="For file storage, the root path of the storage"),
 237         'online': fields.boolean('Online', help="If not checked, media is currently offline and its contents not available", required=True),
 238         'readonly': fields.boolean('Read Only', help="If set, media is for reading only"),
 239     }
 240
 241     def _get_rootpath(self, cr, uid, context=None):
 242         return os.path.join(DMS_ROOT_PATH, cr.dbname)
 243
 244     _defaults = {
 245         'user_id': lambda self, cr, uid, ctx: uid,
 246         'online': lambda *args: True,
 247         'readonly': lambda *args: False,
 248         # Note: the defaults below should only be used ONCE for the default
 249         # storage media. All other times, we should create different paths at least.
 250         'type': lambda *args: 'filestore',
 251         'path': _get_rootpath,
 252     }
 253     _sql_constraints = [
 254         # SQL note: a path = NULL doesn't have to be unique.
 255         ('path_uniq', 'UNIQUE(type,path)', "The storage path must be unique!")
 256         ]
 257
 258     def get_data(self, cr, uid, id, file_node, context=None, fil_obj=None):
 259         """ retrieve the contents of some file_node having storage_id = id
 260             optionally, fil_obj could point to the browse object of the file
 261             (ir.attachment)
 262         """
 263         if not context:
 264             context = {}
 265         boo = self.browse(cr, uid, id, context)
 266         if fil_obj:
 267             ira = fil_obj
 268         else:
 269             ira = self.pool.get('ir.attachment').browse(cr, uid, file_node.file_id, context=context)
 270         return self.__get_data_3(cr, uid, boo, ira, context)
 271
 272     def get_file(self, cr, uid, id, file_node, mode, context=None):
 273         """ Return a file-like object for the contents of some node
 274         """
 275         if context is None:
 276             context = {}
 277         boo = self.browse(cr, uid, id, context)
 278         if not boo.online:
 279             raise RuntimeError('media offline')
 280
 281         ira = self.pool.get('ir.attachment').browse(cr, uid, file_node.file_id, context=context)
 282         if boo.type == 'filestore':
 283             if not ira.store_fname:
 284                 # On a migrated db, some files may have the wrong storage type
 285                 # try to fix their directory.
 286                 if ira.file_size:
 287                     self._doclog.warning( "ir.attachment #%d does not have a filename, but is at filestore, fix it!" % ira.id)
 288                 raise IOError(errno.ENOENT, 'No file can be located')
 289             fpath = os.path.join(boo.path, ira.store_fname)
 290             return nodefd_file(file_node, path=fpath, mode=mode)
 291
 292         elif boo.type == 'db':
 293             # TODO: we need a better api for large files
 294             return nodefd_db(file_node, ira_browse=ira, mode=mode)
 295
 296         elif boo.type == 'db64':
 297             return nodefd_db64(file_node, ira_browse=ira, mode=mode)
 298
 299         elif boo.type == 'realstore':
 300             if not ira.store_fname:
 301                 # On a migrated db, some files may have the wrong storage type
 302                 # try to fix their directory.
 303                 if ira.file_size:
 304                     self._doclog.warning("ir.attachment #%d does not have a filename, trying the name." %ira.id)
 305                 sfname = ira.name
 306             fpath = os.path.join(boo.path,ira.store_fname or ira.name)
 307             if not os.path.exists(fpath):
 308                 raise IOError("File not found: %s" % fpath)
 309             return nodefd_file(file_node, path=fpath, mode=mode)
 310
 311         elif boo.type == 'virtual':
 312             raise ValueError('Virtual storage does not support static files')
 313
 314         else:
 315             raise TypeError("No %s storage" % boo.type)
 316
 317     def __get_data_3(self, cr, uid, boo, ira, context):
 318         if not boo.online:
 319             raise RuntimeError('media offline')
 320         if boo.type == 'filestore':
 321             if not ira.store_fname:
 322                 # On a migrated db, some files may have the wrong storage type
 323                 # try to fix their directory.
 324                 if ira.file_size:
 325                     self._doclog.warning( "ir.attachment #%d does not have a filename, but is at filestore, fix it!" % ira.id)
 326                 return None
 327             fpath = os.path.join(boo.path, ira.store_fname)
 328             return file(fpath, 'rb').read()
 329         elif boo.type == 'db64':
 330             # TODO: we need a better api for large files
 331             if ira.db_datas:
 332                 out = base64.decodestring(ira.db_datas)
 333             else:
 334                 out = ''
 335             return out
 336         elif boo.type == 'db':
 337             # We do an explicit query, to avoid type transformations.
 338             cr.execute('SELECT db_datas FROM ir_attachment WHERE id = %s', (ira.id,))
 339             res = cr.fetchone()
 340             if res:
 341                 return res[0]
 342             else:
 343                 return ''
 344         elif boo.type == 'realstore':
 345             if not ira.store_fname:
 346                 # On a migrated db, some files may have the wrong storage type
 347                 # try to fix their directory.
 348                 if ira.file_size:
 349                     self._doclog.warning("ir.attachment #%d does not have a filename, trying the name." %ira.id)
 350                 sfname = ira.name
 351             fpath = os.path.join(boo.path,ira.store_fname or ira.name)
 352             if os.path.exists(fpath):
 353                 return file(fpath,'rb').read()
 354             elif not ira.store_fname:
 355                 return None
 356             else:
 357                 raise IOError("File not found: %s" % fpath)
 358
 359         elif boo.type == 'virtual':
 360             raise ValueError('Virtual storage does not support static files')
 361
 362         else:
 363             raise TypeError("No %s storage" % boo.type)
 364
 365     def set_data(self, cr, uid, id, file_node, data, context=None, fil_obj=None):
 366         """ store the data.
 367             This function MUST be used from an ir.attachment. It wouldn't make sense
 368             to store things persistently for other types (dynamic).
 369         """
 370         if not context:
 371             context = {}
 372         boo = self.browse(cr, uid, id, context)
 373         if fil_obj:
 374             ira = fil_obj
 375         else:
 376             ira = self.pool.get('ir.attachment').browse(cr, uid, file_node.file_id, context=context)
 377
 378         if not boo.online:
 379             raise RuntimeError('media offline')
 380         self._doclog.debug( "Store data for ir.attachment #%d" % ira.id)
 381         store_fname = None
 382         fname = None
 383         if boo.type == 'filestore':
 384             path = boo.path
 385             try:
 386                 flag = None
 387                 # This can be improved
 388                 if os.path.isdir(path):
 389                     for dirs in os.listdir(path):
 390                         if os.path.isdir(os.path.join(path, dirs)) and len(os.listdir(os.path.join(path, dirs))) < 4000:
 391                             flag = dirs
 392                             break
 393                 flag = flag or create_directory(path)
 394                 filename = random_name()
 395                 fname = os.path.join(path, flag, filename)
 396                 fp = file(fname, 'wb')
 397                 fp.write(data)
 398                 fp.close()
 399                 self._doclog.debug( "Saved data to %s" % fname)
 400                 filesize = len(data) # os.stat(fname).st_size
 401                 store_fname = os.path.join(flag, filename)
 402
 403                 # TODO Here, an old file would be left hanging.
 404
 405             except Exception, e:
 406                 self._doclog.warning( "Couldn't save data to %s", path, exc_info=True)
 407                 raise except_orm(_('Error!'), str(e))
 408         elif boo.type == 'db':
 409             filesize = len(data)
 410             # will that work for huge data?
 411             cr.execute('UPDATE ir_attachment SET db_datas = %s WHERE id = %s',
 412                 (data, file_node.file_id))
 413         elif boo.type == 'db64':
 414             filesize = len(data)
 415             # will that work for huge data?
 416             out = base64.encodestring(data)
 417             cr.execute('UPDATE ir_attachment SET db_datas = %s WHERE id = %s',
 418                 (out, file_node.file_id))
 419         elif boo.type == 'realstore':
 420             try:
 421                 file_node.fix_ppath(cr, ira)
 422                 npath = file_node.full_path() or []
 423                 # npath may contain empty elements, for root directory etc.
 424                 for i, n in enumerate(npath):
 425                     if n == None:
 426                         del npath[i]
 427                 for n in npath:
 428                     for ch in ('*', '|', "\\", '/', ':', '"', '<', '>', '?', '..'):
 429                         if ch in n:
 430                             raise ValueError("Invalid char %s in path %s" %(ch, n))
 431                 dpath = [boo.path,]
 432                 dpath += npath[:-1]
 433                 path = os.path.join(*dpath)
 434                 if not os.path.isdir(path):
 435                     os.makedirs(path)
 436                 fname = os.path.join(path, npath[-1])
 437                 fp = file(fname,'wb')
 438                 fp.write(data)
 439                 fp.close()
 440                 self._doclog.debug("Saved data to %s", fname)
 441                 filesize = len(data) # os.stat(fname).st_size
 442                 store_fname = os.path.join(*npath)
 443                 # TODO Here, an old file would be left hanging.
 444             except Exception,e :
 445                 self._doclog.warning("Couldn't save data:", exc_info=True)
 446                 raise except_orm(_('Error!'), str(e))
 447
 448         elif boo.type == 'virtual':
 449             raise ValueError('Virtual storage does not support static files')
 450
 451         else:
 452             raise TypeError("No %s storage" % boo.type)
 453
 454         # 2nd phase: store the metadata
 455         try:
 456             icont = ''
 457             mime = ira.file_type
 458             if not mime:
 459                 mime = ""
 460             try:
 461                 mime, icont = cntIndex.doIndex(data, ira.datas_fname,
 462                 ira.file_type or None, fname)
 463             except Exception:
 464                 self._doclog.debug('Cannot index file:', exc_info=True)
 465                 pass
 466
 467             try:
 468                 icont_u = ustr(icont)
 469             except UnicodeError:
 470                 icont_u = ''
 471
 472             # a hack: /assume/ that the calling write operation will not try
 473             # to write the fname and size, and update them in the db concurrently.
 474             # We cannot use a write() here, because we are already in one.
 475             cr.execute('UPDATE ir_attachment SET store_fname = %s, file_size = %s, index_content = %s, file_type = %s WHERE id = %s',
 476                 (store_fname, filesize, icont_u, mime, file_node.file_id))
 477             file_node.content_length = filesize
 478             file_node.content_type = mime
 479             return True
 480         except Exception, e :
 481             self._doclog.warning("Couldn't save data:", exc_info=True)
 482             # should we really rollback once we have written the actual data?
 483             # at the db case (only), that rollback would be safe
 484             raise except_orm(_('Error at doc write!'), str(e))
 485
 486     def prepare_unlink(self, cr, uid, storage_bo, fil_bo):
 487         """ Before we unlink a file (fil_boo), prepare the list of real
 488         files that have to be removed, too. """
 489
 490         if not storage_bo.online:
 491             raise RuntimeError('media offline')
 492
 493         if storage_bo.type == 'filestore':
 494             fname = fil_bo.store_fname
 495             if not fname:
 496                 return None
 497             path = storage_bo.path
 498             return (storage_bo.id, 'file', os.path.join(path, fname))
 499         elif storage_bo.type in ('db', 'db64'):
 500             return None
 501         elif storage_bo.type == 'realstore':
 502             fname = fil_bo.store_fname
 503             if not fname:
 504                 return None
 505             path = storage_bo.path
 506             return ( storage_bo.id, 'file', os.path.join(path, fname))
 507         else:
 508             raise TypeError("No %s storage" % storage_bo.type)
 509
 510     def do_unlink(self, cr, uid, unres):
 511         for id, ktype, fname in unres:
 512             if ktype == 'file':
 513                 try:
 514                     os.unlink(fname)
 515                 except Exception, e:
 516                     self._doclog.warning("Could not remove file %s, please remove manually.", fname, exc_info=True)
 517             else:
 518                 self._doclog.warning("Unknown unlink key %s" % ktype)
 519
 520         return True
 521
 522     def simple_rename(self, cr, uid, file_node, new_name, context=None):
 523         """ A preparation for a file rename.
 524             It will not affect the database, but merely check and perhaps
 525             rename the realstore file.
 526
 527             @return the dict of values that can safely be be stored in the db.
 528         """
 529         sbro = self.browse(cr, uid, file_node.storage_id, context=context)
 530         assert sbro, "The file #%d didn't provide storage" % file_node.file_id
 531
 532         if sbro.type in ('filestore', 'db', 'db64'):
 533             # nothing to do for a rename, allow to change the db field
 534             return { 'name': new_name, 'datas_fname': new_name }
 535         elif sbro.type == 'realstore':
 536             fname = fil_bo.store_fname
 537             if not fname:
 538                 return ValueError("Tried to rename a non-stored file")
 539             path = storage_bo.path
 540             oldpath = os.path.join(path, fname)
 541
 542             for ch in ('*', '|', "\\", '/', ':', '"', '<', '>', '?', '..'):
 543                 if ch in new_name:
 544                     raise ValueError("Invalid char %s in name %s" %(ch, new_name))
 545
 546             file_node.fix_ppath(cr, ira)
 547             npath = file_node.full_path() or []
 548             dpath = [path,]
 549             dpath.extend(npath[:-1])
 550             dpath.append(new_name)
 551             newpath = os.path.join(*dpath)
 552             # print "old, new paths:", oldpath, newpath
 553             os.rename(oldpath, newpath)
 554             return { 'name': new_name, 'datas_fname': new_name, 'store_fname': new_name }
 555         else:
 556             raise TypeError("No %s storage" % boo.type)
 557
 558
 559 document_storage()
 560
 561
 562 #eof