#!/usr/bin/python """ NAME xmltv_merge -- merge multiple XMLTV-files into one SYNOPSIS xmltv_merge xmltvfile [xmltvfile ...] DESCRIPTION xmltv_merge merges the contents of multiple XMLTV-files. This can be useful if you grab XMLTV-data from multiple sources and want to create one big file, perhaps for use with `mythfilldatabase'. It also tries to output a file which adheres to the XMLTV DTD. As such, it can also be used as a filter to create such files from non-adhering XMLTV files. xmltv_merge can - roughly - handle multiple XMLTV-files containing data for the same channel; it does so by merging all information for a particular program (with a simple algorithm: for each field in the program structure, keep the field with the longest string). However, it does so only if it can match the program across the different XMLTV-sources: if one source lists a program `A', and another source lists a program `B', both on the same start time on the same channel, xmltv_merge uses a couple of methods to determine if A is the same as B (see the similarity() function below). If they don't match, the program data won't be merged and the entry from the XMLTV-source which is listed first on the commandline `wins'. Also, xmltv_merge doesn't (yet) fix overlaps, gaps or differing start/end times for the same program. If one source lists program `A' as starting on 09:30, and another source lists the same program `A' as starting on 09:27, these both end up in the output as different programs. PREREQUISITES Apart from a decent Python installation (this script was developed using Python 2.5.1 on Mac OS X 10.5), you'll need the lxml module from http://codespeak.net/lxml/ TODO LIST - handle overlaps/gaps - make similarity thresholds configurable - code cleanups AUTHOR Robert Klep (robert AT klep DOT name) LICENSE Public domain. VERSION $Rev: 39 $ """ from lxml import etree, objectify from datetime import datetime import sys, os.path, re if len(sys.argv) < 2: print >> sys.stderr, "Use: %s xmltvfile [xmltvfile ...]" % os.path.basename(sys.argv[0]) sys.exit(1) channels = {} channellist = {} for idx, file in enumerate(sys.argv[1:]): print >> sys.stderr, "going to read '%s'..." % file try: xml = objectify.parse(file) except Exception, e: print >> sys.stderr, "Caught exception parsing '%s': %s" % (file, e) continue for c in xml.xpath("//channel"): if c.get('id') not in channels: channels[c.get('id')] = [] channels[c.get('id')].append(etree.tostring(c)) programs = xml.xpath("//programme") print >> sys.stderr, "\t...contains data for %s programs" % len(programs) for program in programs: channel = program.get("channel") start = program.get("start") if channel not in channellist: channellist[channel] = {} if start not in channellist[channel]: channellist[channel][start] = [] channellist[channel][start].append(program) def merge(first, second): for k, v in first.__dict__.items(): if k in second.__dict__: # handle lists if not isinstance(v, objectify.StringElement): merge(v, second.__dict__[k]) first[k] = v else: f = unicode(v) s = unicode(second.__dict__[k]) if len(s) > len(f): first[k] = second.__dict__[k] for k, v in second.__dict__.items(): if k not in first.__dict__: first[k] = v # an RE (built using Perl's Regexp::Trie) to match English, Dutch and # German stopwords stopwords = re.compile(r'(?u)\b(?:a(?:(?:bout|ls?|nd?|re|u(?:ch|[fs])|[fmst]))?|b(?:ei?|i(?:st?|[jn])|y)|com|d(?:a(?:(?:durch|her|rum|ss?|[nt]))?|e(?:(?:ine?|s(?:(?:halb|sen))?|[mnr]))?|i(?:e(?:s(?:e[rs])?)?|t)|o(?:ch|rt)|u(?:rch)?)|e(?:en|in(?:e[mnrs]?)?|u(?:er|re)|[nrs])|f(?:or|rom)|h(?:a(?:tte(?:(?:st|[nt]))?|d)|e[bmt]|i(?:er|j)|o[ew]|un)|i(?:ch|hre?|st?|[kmnt])|j(?:e(?:(?:de[mnrs]?|ne[rs]|tzt))?|a)|kan(?:n(?:st)?)?|la|m(?:achen|e(?:(?:ine?|[nt]))?|i[jt]|usst)|n(?:ach(?:dem)?|ein|icht|og|un?)|o(?:der|ns?|ok|[fr])|s(?:ei(?:ne?|d)|i(?:ch|nd|e)|o(?:ll(?:(?:en|st|t))?|nst|w(?:eit|ie)))|t(?:h(?:at|is|e)|ot?|e)|u(?:it|n(?:ser|ter|d))|v(?:an|o[mnr])|w(?:a(?:nn|rum|s)|e(?:(?:itere?|nn|r(?:de[nt]?)?|shalb|l))?|h(?:at|e(?:re|n)|o)|i(?:e(?:(?:der|so))?|ll|r(?:(?:st|d))?|th|j)|o(?:h(?:er|in))?|ww)|z(?:al|ei?|ij|ou?|u[mr]?)|I)\b', re.I) def similarity(str1, str2): # shortcut if str1 == str2: return 1.0 # sanitize strings first str1 = re.sub(r'(?u)\s+', ' ', str1.lower()) str2 = re.sub(r'(?u)\s+', ' ', str2.lower()) # remove ellipsis str1 = re.sub(r'(?u)\s+\S*\.\.\.$', '', str1) str2 = re.sub(r'(?u)\s+\S*\.\.\.$', '', str2) # remove stopwords str1 = stopwords.sub('', str1) str2 = stopwords.sub('', str2) # are strings substring of each other? stripped1 = re.sub(r'(?u)[\W]', '', str1) stripped2 = re.sub(r'(?u)[\W]', '', str2) if (stripped1 in stripped2) or (stripped2 in stripped1): return 1.0 # or perhaps an abbreviation? abbr1 = re.sub(r'(?u)\b(\S)\S*', r'\1', str1).replace(' ', '') abbr2 = re.sub(r'(?u)\b(\S)\S*', r'\1', str2).replace(' ', '') if abbr1 == str2 or abbr2 == str1: return 1.0 # remove accented characters and vowels stripped1 = re.sub(r'[\?aeiouy]', '', str1.encode('ascii', 'replace')) stripped2 = re.sub(r'[\?aeiouy]', '', str2.encode('ascii', 'replace')) # count word-overlap stripped1 = re.sub(r'(?u)[^\w\s]', '', stripped1) stripped2 = re.sub(r'(?u)[^\w\s]', '', stripped2) set1 = set([ i for i in stripped1.split(" ") ]) set2 = set([ i for i in stripped2.split(" ") ]) cmn = set1 & set2 return len(cmn) / float(min(len(set1), len(set2))) programs = [] for channelkey in sorted(channellist, cmp): programkeys = sorted(channellist[channelkey]) for idx, programkey in enumerate(programkeys): programlist = channellist[channelkey][programkey] first = programlist.pop(0) while len(programlist): second = programlist.pop(0) title1 = unicode(first['title']) title2 = unicode(second['title']) if similarity(title1, title2) >= 0.75: merge(first, second) continue print >> sys.stderr, "titles don't match ('%s' != '%s')" % (title1.encode('utf-8'), title2.encode('utf-8')) try: desc1 = unicode(first['desc']) desc2 = unicode(second['desc']) if similarity(desc1, desc2) >= 0.70: print >> sys.stderr, "\tdid match descriptions" merge(first, second) continue except: pass print >> sys.stderr, "\tcan't match descriptions either" try: subtitle1 = unicode(first['sub-title']) subtitle2 = unicode(second['sub-title']) if similarity(subtitle1, subtitle2) >= 0.60: print >> sys.stderr, "\tdid match sub-title" merge(first, second) continue except: pass print >> sys.stderr, "\tcan't match sub-title either, giving up..." sortedlist = [] for kw in [ 'title', 'sub-title', 'desc', 'credits', 'date', 'category', 'language', 'orig-language', 'length', 'icon', 'url', 'country', 'episode-num', 'video', 'audio', 'previously-shown', 'premiere', 'last-chance', 'new', 'subtitles', 'rating', 'star-rating' ]: try: # TODO: