3166 feed generation needs performance improvement
3306 feed returns invalid last-modified header

   1 #!/usr/bin/python2.4
   2 #
   3 # CDDL HEADER START
   4 #
   5 # The contents of this file are subject to the terms of the
   6 # Common Development and Distribution License (the "License").
   7 # You may not use this file except in compliance with the License.
   8 #
   9 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10 # or http://www.opensolaris.org/os/licensing.
  11 # See the License for the specific language governing permissions
  12 # and limitations under the License.
  13 #
  14 # When distributing Covered Code, include this CDDL HEADER in each
  15 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16 # If applicable, add the following below this CDDL HEADER, with the
  17 # fields enclosed by brackets "[]" replaced with your own identifying
  18 # information: Portions Copyright [yyyy] [name of copyright owner]
  19 #
  20 # CDDL HEADER END
  21 #
  22 # Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23 # Use is subject to license terms.
  24 
  25 """feed - routines for generating RFC 4287 Atom feeds for packaging server
  26 
  27    At present, the pkg.server.feed module provides a set of routines that, from
  28    a catalog, allow the construction of a feed representing the activity within
  29    a given time period."""
  30 
  31 import cherrypy
  32 from cherrypy.lib.static import serve_file
  33 import cStringIO
  34 import datetime
  35 import httplib
  36 import os
  37 import rfc822
  38 import sys
  39 import time
  40 import urllib
  41 import xml.dom.minidom as xmini
  42 
  43 from pkg.misc import get_rel_path, get_res_path
  44 import pkg.server.catalog as catalog
  45 import pkg.fmri as fmri
  46 import pkg.Uuid25 as uuid
  47 
# MIME type used for the Content-type of the feed returned to clients.
MIME_TYPE = 'application/atom+xml'
# File name of the cached feed; it is joined onto scfg.repo_root to form the
# cache path.
CACHE_FILENAME = "feed.xml"
# strftime/strptime pattern for the RFC 3339 timestamps used in the feed.  The
# trailing 'Z' is a literal character in the pattern; the pattern itself does
# no timezone conversion.
RFC3339_FMT = "%Y-%m-%dT%H:%M:%SZ"
  51 
  52 def dt_to_rfc3339_str(ts):
  53         """Returns a string representing a datetime object formatted according
  54         to RFC 3339.
  55         """
  56         return ts.strftime(RFC3339_FMT)
  57 
  58 def rfc3339_str_to_ts(ts_str):
  59         """Returns a timestamp representing 'ts_str', which should be in the
  60         format specified by RFC 3339.
  61         """
  62         return time.mktime(time.strptime(ts_str, RFC3339_FMT))
  63 
  64 def rfc3339_str_to_dt(ts_str):
  65         """Returns a datetime object representing 'ts_str', which should be in
  66         the format specified by RFC 3339.
  67         """
  68         return datetime.datetime(*time.strptime(ts_str, RFC3339_FMT)[0:6])
  69 
  70 def ults_to_ts(ts_str):
  71         """Returns a timestamp representing 'ts_str', which should be in
  72         updatelog format.
  73         """
  74         # Python doesn't support fractional seconds for strptime.
  75         ts_str = ts_str.split('.')[0]
  76         # Currently, updatelog entries are in local time, not UTC.
  77         return time.mktime(time.strptime(ts_str, "%Y-%m-%dT%H:%M:%S"))
  78 
  79 def ults_to_rfc3339_str(ts_str):
  80         """Returns a timestamp representing 'ts_str', which should be in
  81         updatelog format.
  82         """
  83         ltime = ults_to_ts(ts_str)
  84         # Currently, updatelog entries are in local time, not UTC.
  85         return dt_to_rfc3339_str(datetime.datetime(
  86             *time.gmtime(ltime)[0:6]))
  87 
  88 def fmri_to_taguri(rcfg, f):
  89         """Generates a 'tag' uri compliant with RFC 4151.  Visit
  90         http://www.taguri.org/ for more information.
  91         """
  92         return "tag:%s,%s:%s" % (rcfg.get_attribute("feed",
  93             "authority"), f.get_timestamp().strftime("%Y-%m-%d"),
  94             urllib.unquote(f.get_url_path()))
  95 
  96 def init(scfg, rcfg):
  97         """This function performs general initialization work that is needed
  98         for feeds to work correctly.
  99         """
 100 
 101         if not scfg.is_read_only():
 102                 # RSS/Atom feeds require a unique identifier, so
 103                 # generate one if isn't defined already.  This
 104                 # needs to be a persistent value, so we only
 105                 # generate this if we can save the configuration.
 106                 fid = rcfg.get_attribute("feed", "id")
 107                 if not fid:
 108                         # Create a random UUID (type 4).
 109                         rcfg._set_attribute("feed", "id", uuid.uuid4())
 110 
 111                 # Ensure any configuration changes are reflected in the feed.
 112                 __clear_cache(scfg)
 113 
 114 def set_title(request, rcfg, doc, feed, update_ts):
 115         """This function attaches the necessary RSS/Atom feed elements needed
 116         to provide title, author and contact information to the provided
 117         xmini document object using the provided feed object and update
 118         time.
 119         """
 120 
 121         t = doc.createElement("title")
 122         ti = xmini.Text()
 123         ti.replaceWholeText(rcfg.get_attribute("feed", "name"))
 124         t.appendChild(ti)
 125         feed.appendChild(t)
 126 
 127         l = doc.createElement("link")
 128         l.setAttribute("href", cherrypy.url())
 129         l.setAttribute("rel", "self")
 130         feed.appendChild(l)
 131 
 132         # Atom requires each feed to have a permanent, universally unique
 133         # identifier.
 134         i = doc.createElement("id")
 135         it = xmini.Text()
 136         it.replaceWholeText("urn:uuid:%s" % rcfg.get_attribute("feed", "id"))
 137         i.appendChild(it)
 138         feed.appendChild(i)
 139 
 140         # Indicate when the feed was last updated.
 141         u = doc.createElement("updated")
 142         ut = xmini.Text()
 143         ut.replaceWholeText(dt_to_rfc3339_str(update_ts))
 144         u.appendChild(ut)
 145         feed.appendChild(u)
 146 
 147         # Add our icon.
 148         i = doc.createElement("icon")
 149         it = xmini.Text()
 150         it.replaceWholeText(get_res_path(request, rcfg.get_attribute(
 151             "feed", "icon")))
 152         i.appendChild(it)
 153         feed.appendChild(i)
 154 
 155         # Add our logo.
 156         l = doc.createElement("logo")
 157         lt = xmini.Text()
 158         lt.replaceWholeText(get_res_path(request, rcfg.get_attribute(
 159             "feed", "logo")))
 160         l.appendChild(lt)
 161         feed.appendChild(l)
 162 
 163         maintainer = rcfg.get_attribute("repository", "maintainer")
 164         # The author information isn't required, but can be useful.
 165         if maintainer:
 166                 name, email = rfc822.AddressList(maintainer).addresslist[0]
 167 
 168                 if email and not name:
 169                         # If we got an email address, but no name, then
 170                         # the name was likely parsed as a local address. In
 171                         # that case, assume the whole string is the name.
 172                         name = maintainer
 173                         email = None
 174 
 175                 a = doc.createElement("author")
 176 
 177                 # First we have to add a name element. This is required if an
 178                 # author element exists.
 179                 n = doc.createElement("name")
 180                 nt = xmini.Text()
 181                 nt.replaceWholeText(name)
 182                 n.appendChild(nt)
 183                 a.appendChild(n)
 184 
 185                 if email:
 186                         # If we were able to extract an email address from the
 187                         # maintainer information, add the optional email
 188                         # element to provide a point of communication.
 189                         e = doc.createElement("email")
 190                         et = xmini.Text()
 191                         et.replaceWholeText(email)
 192                         e.appendChild(et)
 193                         a.appendChild(e)
 194 
 195                 # Done with the author.
 196                 feed.appendChild(a)
 197 
# Maps an updatelog operation code to [entry title prefix, content template].
# Each content template takes the package FMRI string as its only "%s"
# argument.  add_transaction substitutes the "U" entry for "+" transactions
# that it determines to be updates to an existing package.
operations = {
        "+": ["Added", "%s was added to the repository."],
        "-": ["Removed", "%s was removed from the repository."],
        "U": ["Updated", "%s, an update to an existing package, was added to "
            "the repository."]
}
 204 
 205 def add_transaction(request, scfg, rcfg, doc, feed, txn, fmris):
 206         """Each transaction is an entry.  We have non-trivial content, so we
 207         can omit summary elements.
 208         """
 209 
 210         e = doc.createElement("entry")
 211 
 212         tag, fmri_str = txn["catalog"].split()
 213         f = fmri.PkgFmri(fmri_str)
 214  
 215         # Generate a 'tag' uri, to uniquely identify the entry, using the fmri.
 216         i = xmini.Text()
 217         i.replaceWholeText(fmri_to_taguri(rcfg, f))
 218         eid = doc.createElement("id")
 219         eid.appendChild(i)
 220         e.appendChild(eid)
 221 
 222         # Attempt to determine the operation that was performed and generate
 223         # the entry title and content.
 224         if txn["operation"] in operations:
 225                 op_title, op_content = operations[txn["operation"]]
 226         else:
 227                 # XXX Better way to reflect an error?  (Aborting will make a
 228                 # non-well-formed document.)
 229                 op_title = "Unknown Operation"
 230                 op_content = "%s was changed in the repository."
 231 
 232         if txn["operation"] == "+":

 233                 # Get all FMRIs matching the current FMRI's package name.
 234                 matches = fmris[f.pkg_name]
 235                 if len(matches["versions"]) > 1:
 236                         # Get the oldest fmri.
 237                         of = matches[str(matches["versions"][0])][0]


 238 
 239                         # If the current fmri isn't the oldest one, then this
 240                         # is an update to the package.
 241                         if f != of:
 242                                 # If there is more than one matching FMRI, and
 243                                 # it isn't the same version as the oldest one,
 244                                 # we can assume that this is an update to an
 245                                 # existing package.
 246                                 op_title, op_content = operations["U"]
 247 
 248         # Now add a title for our entry.
 249         etitle = doc.createElement("title")
 250         ti = xmini.Text()
 251         ti.replaceWholeText(" ".join([op_title, fmri_str]))
 252         etitle.appendChild(ti)
 253         e.appendChild(etitle)
 254 
 255         # Indicate when the entry was last updated (in this case, when the
 256         # package was added).
 257         eu = doc.createElement("updated")
 258         ut = xmini.Text()
 259         ut.replaceWholeText(ults_to_rfc3339_str(txn["timestamp"]))
 260         eu.appendChild(ut)
 261         e.appendChild(eu)
 262 
 263         # Link to the info output for the given package FMRI.
 264         e_uri = get_rel_path(request, 'info/0/%s' % f.get_url_path())
 265 
 266         l = doc.createElement("link")
 267         l.setAttribute("rel", "alternate")
 268         l.setAttribute("href", e_uri)
 269         e.appendChild(l)
 270 
 271         # Using the description for the operation performed, add the FMRI and
 272         # tag information.
 273         content_text = op_content % fmri_str
 274         if tag == "C":
 275                 content_text += "  This version is tagged as critical."
 276 
 277         co = xmini.Text()
 278         co.replaceWholeText(content_text)
 279         ec = doc.createElement("content")
 280         ec.appendChild(co)
 281         e.appendChild(ec)
 282 
 283         feed.appendChild(e)
 284 
 285 def update(request, scfg, rcfg, t, cf):
 286         """Generate new Atom document for current updates.  The cached feed
 287         file is written to scfg.repo_root/CACHE_FILENAME.
 288         """
 289 
 290         # Our configuration is stored in hours, convert it to seconds.
 291         window_seconds = rcfg.get_attribute("feed", "window") * 60 * 60
 292         feed_ts = datetime.datetime.fromtimestamp(t - window_seconds)
 293 
 294         d = xmini.Document()
 295 
 296         feed = d.createElementNS("http://www.w3.org/2005/Atom", "feed")
 297         feed.setAttribute("xmlns", "http://www.w3.org/2005/Atom")
 298 
 299         set_title(request, rcfg, d, feed, scfg.updatelog.last_update)
 300 
 301         d.appendChild(feed)
 302 
 303         # The feed should be presented in reverse chronological order.
 304         def compare_ul_entries(a, b):
 305                 return cmp(ults_to_ts(a["timestamp"]),
 306                     ults_to_ts(b["timestamp"]))
 307 
 308         # Get the entire catalog in the format returned by catalog.cache_fmri,
 309         # so that we don't have to keep looking for possible matches.
 310         fmris = {}
 311         catalog.ServerCatalog.read_catalog(fmris,
 312             scfg.updatelog.catalog.catalog_root)
 313 
 314         for txn in sorted(scfg.updatelog.gen_updates_as_dictionaries(feed_ts),
 315             cmp=compare_ul_entries, reverse=True):
 316                 add_transaction(request, scfg, rcfg, d, feed, txn, fmris)
 317 
 318         d.writexml(cf)
 319 
 320 def __get_cache_pathname(scfg):
 321         return os.path.join(scfg.repo_root, CACHE_FILENAME)
 322 
 323 def __clear_cache(scfg):
 324         if scfg.is_read_only():
 325                 # Ignore the request due to server configuration.
 326                 return
 327 
 328         pathname = __get_cache_pathname(scfg)
 329         try:
 330                 if os.path.exists(pathname):
 331                         os.remove(pathname)
 332         except IOError:
 333                 raise cherrypy.HTTPError(
 334                     httplib.INTERNAL_SERVER_ERROR,
 335                     "Unable to clear feed cache.")
 336 
 337 def __cache_needs_update(scfg):
 338         """Checks to see if the feed cache file exists and if it is still
 339         valid.  Returns False, None if the cache is valid or True, last
 340         where last is a timestamp representing when the cache was
 341         generated.
 342         """
 343         cfpath = __get_cache_pathname(scfg)
 344         last = None
 345         need_update = True
 346         if os.path.isfile(cfpath):
 347                 # Attempt to parse the cached copy.  If we can't, for any
 348                 # reason, assume we need to remove it and start over.
 349                 try:
 350                         d = xmini.parse(cfpath)
 351                 except Exception:
 352                         d = None
 353                         __clear_cache(scfg)
 354 
 355                 # Get the feed element and attempt to get the time we last
 356                 # generated the feed to determine whether we need to regenerate
 357                 # it.  If for some reason we can't get that information, assume
 358                 # the cache is invalid, clear it, and force regeneration.
 359                 fe = None
 360                 if d:
 361                         fe = d.childNodes[0]
 362 
 363                 if fe:
 364                         utn = None
 365                         for cnode in fe.childNodes:
 366                                 if cnode.nodeName == "updated":
 367                                         utn = cnode.childNodes[0]
 368                                         break
 369 
 370                         if utn:
 371                                 last_ts = rfc3339_str_to_dt(utn.nodeValue)
 372 
 373                                 # Since our feed cache and updatelog might have
 374                                 # been created within the same second, we need
 375                                 # to ignore small variances when determining
 376                                 # whether to update the feed cache.
 377                                 update_ts = scfg.updatelog.last_update.replace(
 378                                     microsecond=0)
 379 
 380                                 if last_ts >= update_ts:
 381                                         need_update = False
 382                                 else:
 383                                         last = rfc3339_str_to_ts(utn.nodeValue)
 384                         else:
 385                                 __clear_cache(scfg)
 386                 else:
 387                         __clear_cache(scfg)
 388 
 389         return need_update, last
 390 
 391 def handle(scfg, rcfg, request, response):
 392         """If there have been package updates since we last generated the feed,
 393         update the feed and send it to the client.  Otherwise, send them the
 394         cached copy if it is available.
 395         """
 396 
 397         cfpath = __get_cache_pathname(scfg)
 398 
 399         # First check to see if we already have a valid cache of the feed.
 400         need_update, last = __cache_needs_update(scfg)
 401 
 402         if need_update:
 403                 # Update always looks at feed.window seconds before the last
 404                 # update until "now."  If last is none, we want it to use "now"
 405                 # as its starting point.
 406                 if last is None:
 407                         last = time.time()
 408 
 409                 if scfg.is_read_only():
 410                         # If the server is operating in readonly mode, the
 411                         # feed will have to be generated every time.
 412                         cf = cStringIO.StringIO()
 413                         update(request, scfg, rcfg, last, cf)
 414                         cf.seek(0)
 415                         buf = cf.read()
 416                         cf.close()
 417 
 418                         # Now that the feed has been generated, set the headers
 419                         # correctly and return it.
 420                         response.headers['Content-type'] = MIME_TYPE
 421 
 422                         # Return the current time and date in GMT.
 423                         response.headers['Last-Modified'] = rfc822.formatdate()
 424 
 425                         response.headers['Content-length'] = len(buf)
 426                         return buf
 427                 else:
 428                         # If the server isn't operating in readonly mode, the
 429                         # feed can be generated and cached in inst_dir.
 430                         cf = file(cfpath, "w")
 431                         update(request, scfg, rcfg, last, cf)
 432                         cf.close()
 433 
 434         return serve_file(cfpath, MIME_TYPE)
 435 
--- EOF ---