3166 feed generation needs performance improvement
3306 feed returns invalid last-modified header

   1 #!/usr/bin/python2.4
   2 #
   3 # CDDL HEADER START
   4 #
   5 # The contents of this file are subject to the terms of the
   6 # Common Development and Distribution License (the "License").
   7 # You may not use this file except in compliance with the License.
   8 #
   9 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10 # or http://www.opensolaris.org/os/licensing.
  11 # See the License for the specific language governing permissions
  12 # and limitations under the License.
  13 #
  14 # When distributing Covered Code, include this CDDL HEADER in each
  15 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16 # If applicable, add the following below this CDDL HEADER, with the
  17 # fields enclosed by brackets "[]" replaced with your own identifying
  18 # information: Portions Copyright [yyyy] [name of copyright owner]
  19 #
  20 # CDDL HEADER END
  21 #
  22 # Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23 # Use is subject to license terms.
  24 
  25 """feed - routines for generating RFC 4287 Atom feeds for packaging server
  26 
  27    At present, the pkg.server.feed module provides a set of routines that, from
  28    a catalog, allow the construction of a feed representing the activity within
  29    a given time period."""
  30 
  31 import cherrypy
  32 from cherrypy.lib.static import serve_file
  33 import cStringIO
  34 import datetime
  35 import httplib
  36 import os
  37 import rfc822

  38 import time
  39 import urllib
  40 import xml.dom.minidom as xmini
  41 
  42 from pkg.misc import get_rel_path, get_res_path
  43 import pkg.catalog as catalog
  44 import pkg.fmri as fmri
  45 import pkg.Uuid25 as uuid
  46 
  47 MIME_TYPE = 'application/atom+xml'
  48 CACHE_FILENAME = "feed.xml"
  49 RFC3339_FMT = "%Y-%m-%dT%H:%M:%SZ"
  50 
  51 def dt_to_rfc3339_str(ts):
  52         """Returns a string representing a datetime object formatted according
  53         to RFC 3339.
  54         """
  55         return ts.strftime(RFC3339_FMT)
  56 
  57 def rfc3339_str_to_ts(ts_str):
  58         """Returns a timestamp representing 'ts_str', which should be in the
  59         format specified by RFC 3339.
  60         """
  61         return time.mktime(time.strptime(ts_str, RFC3339_FMT))
  62 
  63 def rfc3339_str_to_dt(ts_str):
  64         """Returns a datetime object representing 'ts_str', which should be in
  65         the format specified by RFC 3339.
  66         """
  67         return datetime.datetime(*time.strptime(ts_str, RFC3339_FMT)[0:6])
  68 
  69 def ults_to_ts(ts_str):
  70         """Returns a timestamp representing 'ts_str', which should be in
  71         updatelog format.
  72         """
  73         # Python doesn't support fractional seconds for strptime.
  74         ts_str = ts_str.split('.')[0]
  75         # Currently, updatelog entries are in local time, not UTC.
  76         return time.mktime(time.strptime(ts_str, "%Y-%m-%dT%H:%M:%S"))
  77 
  78 def ults_to_rfc3339_str(ts_str):
  79         """Returns a timestamp representing 'ts_str', which should be in
  80         updatelog format.
  81         """
  82         ltime = ults_to_ts(ts_str)
  83         # Currently, updatelog entries are in local time, not UTC.
  84         return dt_to_rfc3339_str(datetime.datetime(
  85             *time.gmtime(ltime)[0:6]))
  86 
  87 def fmri_to_taguri(rcfg, f):
  88         """Generates a 'tag' uri compliant with RFC 4151.  Visit
  89         http://www.taguri.org/ for more information.
  90         """
  91         return "tag:%s,%s:%s" % (rcfg.get_attribute("feed",
  92             "authority"), f.get_timestamp().strftime("%Y-%m-%d"),
  93             urllib.unquote(f.get_url_path()))
  94 
  95 def init(scfg, rcfg):
  96         """This function performs general initialization work that is needed
  97         for feeds to work correctly.
  98         """
  99 
 100         if not scfg.is_read_only():
 101                 # RSS/Atom feeds require a unique identifier, so
 102                 # generate one if isn't defined already.  This
 103                 # needs to be a persistent value, so we only
 104                 # generate this if we can save the configuration.
 105                 fid = rcfg.get_attribute("feed", "id")
 106                 if not fid:
 107                         # Create a random UUID (type 4).
 108                         rcfg._set_attribute("feed", "id", uuid.uuid4())
 109 
 110                 # Ensure any configuration changes are reflected in the feed.
 111                 __clear_cache(scfg)
 112 
 113 def set_title(request, rcfg, doc, feed, update_ts):
 114         """This function attaches the necessary RSS/Atom feed elements needed
 115         to provide title, author and contact information to the provided
 116         xmini document object using the provided feed object and update
 117         time.
 118         """
 119 
 120         t = doc.createElement("title")
 121         ti = xmini.Text()
 122         ti.replaceWholeText(rcfg.get_attribute("feed", "name"))
 123         t.appendChild(ti)
 124         feed.appendChild(t)
 125 
 126         l = doc.createElement("link")
 127         l.setAttribute("href", cherrypy.url())
 128         l.setAttribute("rel", "self")
 129         feed.appendChild(l)
 130 
 131         # Atom requires each feed to have a permanent, universally unique
 132         # identifier.
 133         i = doc.createElement("id")
 134         it = xmini.Text()
 135         it.replaceWholeText("urn:uuid:%s" % rcfg.get_attribute("feed", "id"))
 136         i.appendChild(it)
 137         feed.appendChild(i)
 138 
 139         # Indicate when the feed was last updated.
 140         u = doc.createElement("updated")
 141         ut = xmini.Text()
 142         ut.replaceWholeText(dt_to_rfc3339_str(update_ts))
 143         u.appendChild(ut)
 144         feed.appendChild(u)
 145 
 146         # Add our icon.
 147         i = doc.createElement("icon")
 148         it = xmini.Text()
 149         it.replaceWholeText(get_res_path(request, rcfg.get_attribute(
 150             "feed", "icon")))
 151         i.appendChild(it)
 152         feed.appendChild(i)
 153 
 154         # Add our logo.
 155         l = doc.createElement("logo")
 156         lt = xmini.Text()
 157         lt.replaceWholeText(get_res_path(request, rcfg.get_attribute(
 158             "feed", "logo")))
 159         l.appendChild(lt)
 160         feed.appendChild(l)
 161 
 162         maintainer = rcfg.get_attribute("repository", "maintainer")
 163         # The author information isn't required, but can be useful.
 164         if maintainer:
 165                 name, email = rfc822.AddressList(maintainer).addresslist[0]
 166 
 167                 if email and not name:
 168                         # If we got an email address, but no name, then
 169                         # the name was likely parsed as a local address. In
 170                         # that case, assume the whole string is the name.
 171                         name = maintainer
 172                         email = None
 173 
 174                 a = doc.createElement("author")
 175 
 176                 # First we have to add a name element. This is required if an
 177                 # author element exists.
 178                 n = doc.createElement("name")
 179                 nt = xmini.Text()
 180                 nt.replaceWholeText(name)
 181                 n.appendChild(nt)
 182                 a.appendChild(n)
 183 
 184                 if email:
 185                         # If we were able to extract an email address from the
 186                         # maintainer information, add the optional email
 187                         # element to provide a point of communication.
 188                         e = doc.createElement("email")
 189                         et = xmini.Text()
 190                         et.replaceWholeText(email)
 191                         e.appendChild(et)
 192                         a.appendChild(e)
 193 
 194                 # Done with the author.
 195                 feed.appendChild(a)
 196 
 197 operations = {
 198         "+": ["Added", "%s was added to the repository."],
 199         "-": ["Removed", "%s was removed from the repository."],
 200         "U": ["Updated", "%s, an update to an existing package, was added to "
 201             "the repository."]
 202 }
 203 
 204 def add_transaction(request, scfg, rcfg, doc, feed, txn):
 205         """Each transaction is an entry.  We have non-trivial content, so we
 206         can omit summary elements.
 207         """
 208 
 209         e = doc.createElement("entry")
 210 
 211         tag, fmri_str = txn["catalog"].split()
 212         f = fmri.PkgFmri(fmri_str)
 213  
 214         # Generate a 'tag' uri, to uniquely identify the entry, using the fmri.
 215         i = xmini.Text()
 216         i.replaceWholeText(fmri_to_taguri(rcfg, f))
 217         eid = doc.createElement("id")
 218         eid.appendChild(i)
 219         e.appendChild(eid)
 220 
 221         # Attempt to determine the operation that was performed and generate
 222         # the entry title and content.
 223         if txn["operation"] in operations:
 224                 op_title, op_content = operations[txn["operation"]]
 225         else:
 226                 # XXX Better way to reflect an error?  (Aborting will make a
 227                 # non-well-formed document.)
 228                 op_title = "Unknown Operation"
 229                 op_content = "%s was changed in the repository."
 230 
 231         if txn["operation"] == "+":
 232                 c = scfg.updatelog.catalog
 233                 # Get all FMRIs matching the current FMRI's package name.
 234                 matches = catalog.extract_matching_fmris(c.fmris(),
 235                     f.get_name(), matcher=fmri.exact_name_match)
 236 
 237                 if len(matches) > 1:
 238                         # Get the oldest fmri (it's the last entry).
 239                         of = matches[-1]
 240 
 241                         # If the current fmri isn't the oldest one, then this
 242                         # is an update to the package.
 243                         if f != of:
 244                                 # If there is more than one matching FMRI, and
 245                                 # it isn't the same version as the oldest one,
 246                                 # we can assume that this is an update to an
 247                                 # existing package.
 248                                 op_title, op_content = operations["U"]
 249 
 250         # Now add a title for our entry.
 251         etitle = doc.createElement("title")
 252         ti = xmini.Text()
 253         ti.replaceWholeText(" ".join([op_title, fmri_str]))
 254         etitle.appendChild(ti)
 255         e.appendChild(etitle)
 256 
 257         # Indicate when the entry was last updated (in this case, when the
 258         # package was added).
 259         eu = doc.createElement("updated")
 260         ut = xmini.Text()
 261         ut.replaceWholeText(ults_to_rfc3339_str(txn["timestamp"]))
 262         eu.appendChild(ut)
 263         e.appendChild(eu)
 264 
 265         # Link to the info output for the given package FMRI.
 266         e_uri = get_rel_path(request, 'info/0/%s' % f.get_url_path())
 267 
 268         l = doc.createElement("link")
 269         l.setAttribute("rel", "alternate")
 270         l.setAttribute("href", e_uri)
 271         e.appendChild(l)
 272 
 273         # Using the description for the operation performed, add the FMRI and
 274         # tag information.
 275         content_text = op_content % fmri_str
 276         if tag == "C":
 277                 content_text += "  This version is tagged as critical."
 278 
 279         co = xmini.Text()
 280         co.replaceWholeText(content_text)
 281         ec = doc.createElement("content")
 282         ec.appendChild(co)
 283         e.appendChild(ec)
 284 
 285         feed.appendChild(e)
 286 
 287 def update(request, scfg, rcfg, t, cf):
 288         """Generate new Atom document for current updates.  The cached feed
 289         file is written to scfg.repo_root/CACHE_FILENAME.
 290         """
 291 
 292         # Our configuration is stored in hours, convert it to seconds.
 293         window_seconds = rcfg.get_attribute("feed", "window") * 60 * 60
 294         feed_ts = datetime.datetime.fromtimestamp(t - window_seconds)
 295 
 296         d = xmini.Document()
 297 
 298         feed = d.createElementNS("http://www.w3.org/2005/Atom", "feed")
 299         feed.setAttribute("xmlns", "http://www.w3.org/2005/Atom")
 300 
 301         set_title(request, rcfg, d, feed, scfg.updatelog.last_update)
 302 
 303         d.appendChild(feed)
 304 
 305         # The feed should be presented in reverse chronological order.
 306         def compare_ul_entries(a, b):
 307                 return cmp(ults_to_ts(a["timestamp"]),
 308                     ults_to_ts(b["timestamp"]))
 309 






 310         for txn in sorted(scfg.updatelog.gen_updates_as_dictionaries(feed_ts),
 311             cmp=compare_ul_entries, reverse=True):
 312                 add_transaction(request, scfg, rcfg, d, feed, txn)
 313 
 314         d.writexml(cf)
 315 
 316 def __get_cache_pathname(scfg):
 317         return os.path.join(scfg.repo_root, CACHE_FILENAME)
 318 
 319 def __clear_cache(scfg):
 320         if scfg.is_read_only():
 321                 # Ignore the request due to server configuration.
 322                 return
 323 
 324         pathname = __get_cache_pathname(scfg)
 325         try:
 326                 if os.path.exists(pathname):
 327                         os.remove(pathname)
 328         except IOError:
 329                 raise cherrypy.HTTPError(
 330                     httplib.INTERNAL_SERVER_ERROR,
 331                     "Unable to clear feed cache.")
 332 
 333 def __cache_needs_update(scfg):
 334         """Checks to see if the feed cache file exists and if it is still
 335         valid.  Returns False, None if the cache is valid or True, last
 336         where last is a timestamp representing when the cache was
 337         generated.
 338         """
 339         cfpath = __get_cache_pathname(scfg)
 340         last = None
 341         need_update = True
 342         if os.path.isfile(cfpath):
 343                 # Attempt to parse the cached copy.  If we can't, for any
 344                 # reason, assume we need to remove it and start over.
 345                 try:
 346                         d = xmini.parse(cfpath)
 347                 except Exception:
 348                         d = None
 349                         __clear_cache(scfg)
 350 
 351                 # Get the feed element and attempt to get the time we last
 352                 # generated the feed to determine whether we need to regenerate
 353                 # it.  If for some reason we can't get that information, assume
 354                 # the cache is invalid, clear it, and force regeneration.
 355                 fe = None
 356                 if d:
 357                         fe = d.childNodes[0]
 358 
 359                 if fe:
 360                         utn = None
 361                         for cnode in fe.childNodes:
 362                                 if cnode.nodeName == "updated":
 363                                         utn = cnode.childNodes[0]
 364                                         break
 365 
 366                         if utn:
 367                                 last_ts = rfc3339_str_to_dt(utn.nodeValue)
 368 
 369                                 # Since our feed cache and updatelog might have
 370                                 # been created within the same second, we need
 371                                 # to ignore small variances when determining
 372                                 # whether to update the feed cache.
 373                                 update_ts = scfg.updatelog.last_update.replace(
 374                                     microsecond=0)
 375 
 376                                 if last_ts >= update_ts:
 377                                         need_update = False
 378                                 else:
 379                                         last = rfc3339_str_to_ts(utn.nodeValue)
 380                         else:
 381                                 __clear_cache(scfg)
 382                 else:
 383                         __clear_cache(scfg)
 384 
 385         return need_update, last
 386 
 387 def handle(scfg, rcfg, request, response):
 388         """If there have been package updates since we last generated the feed,
 389         update the feed and send it to the client.  Otherwise, send them the
 390         cached copy if it is available.
 391         """
 392 
 393         cfpath = __get_cache_pathname(scfg)
 394 
 395         # First check to see if we already have a valid cache of the feed.
 396         need_update, last = __cache_needs_update(scfg)
 397 
 398         if need_update:
 399                 # Update always looks at feed.window seconds before the last
 400                 # update until "now."  If last is none, we want it to use "now"
 401                 # as its starting point.
 402                 if last is None:
 403                         last = time.time()
 404 
 405                 if scfg.is_read_only():
 406                         # If the server is operating in readonly mode, the
 407                         # feed will have to be generated every time.
 408                         cf = cStringIO.StringIO()
 409                         update(request, scfg, rcfg, last, cf)
 410                         cf.seek(0)
 411                         buf = cf.read()
 412                         cf.close()
 413 
 414                         # Now that the feed has been generated, set the headers
 415                         # correctly and return it.
 416                         response.headers['Content-type'] = MIME_TYPE
 417                         response.headers['Last-Modified'] = \
 418                             datetime.datetime.now().isoformat()


 419                         response.headers['Content-length'] = len(buf)
 420                         return buf
 421                 else:
 422                         # If the server isn't operating in readonly mode, the
 423                         # feed can be generated and cached in inst_dir.
 424                         cf = file(cfpath, "w")
 425                         update(request, scfg, rcfg, last, cf)
 426                         cf.close()
 427 
 428         return serve_file(cfpath, MIME_TYPE)
 429 
--- EOF ---