token-delta.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. #!/usr/bin/env python
  2. from __future__ import absolute_import, division, print_function
  3. import os
  4. import re
  5. import subprocess
  6. import sys
  7. import tempfile
  8. ###
  9. class DeltaAlgorithm(object):
  10. def __init__(self):
  11. self.cache = set()
  12. def test(self, changes):
  13. abstract
  14. ###
  15. def getTestResult(self, changes):
  16. # There is no reason to cache successful tests because we will
  17. # always reduce the changeset when we see one.
  18. changeset = frozenset(changes)
  19. if changeset in self.cache:
  20. return False
  21. elif not self.test(changes):
  22. self.cache.add(changeset)
  23. return False
  24. else:
  25. return True
  26. def run(self, changes, force=False):
  27. # Make sure the initial test passes, if not then (a) either
  28. # the user doesn't expect monotonicity, and we may end up
  29. # doing O(N^2) tests, or (b) the test is wrong. Avoid the
  30. # O(N^2) case unless user requests it.
  31. if not force:
  32. if not self.getTestResult(changes):
  33. raise ValueError('Initial test passed to delta fails.')
  34. # Check empty set first to quickly find poor test functions.
  35. if self.getTestResult(set()):
  36. return set()
  37. else:
  38. return self.delta(changes, self.split(changes))
  39. def split(self, S):
  40. """split(set) -> [sets]
  41. Partition a set into one or two pieces.
  42. """
  43. # There are many ways to split, we could do a better job with more
  44. # context information (but then the API becomes grosser).
  45. L = list(S)
  46. mid = len(L)//2
  47. if mid==0:
  48. return L,
  49. else:
  50. return L[:mid],L[mid:]
  51. def delta(self, c, sets):
  52. # assert(reduce(set.union, sets, set()) == c)
  53. # If there is nothing left we can remove, we are done.
  54. if len(sets) <= 1:
  55. return c
  56. # Look for a passing subset.
  57. res = self.search(c, sets)
  58. if res is not None:
  59. return res
  60. # Otherwise, partition sets if possible; if not we are done.
  61. refined = sum(map(list, map(self.split, sets)), [])
  62. if len(refined) == len(sets):
  63. return c
  64. return self.delta(c, refined)
  65. def search(self, c, sets):
  66. for i,S in enumerate(sets):
  67. # If test passes on this subset alone, recurse.
  68. if self.getTestResult(S):
  69. return self.delta(S, self.split(S))
  70. # Otherwise if we have more than two sets, see if test
  71. # pases without this subset.
  72. if len(sets) > 2:
  73. complement = sum(sets[:i] + sets[i+1:],[])
  74. if self.getTestResult(complement):
  75. return self.delta(complement, sets[:i] + sets[i+1:])
  76. ###
  77. class Token(object):
  78. def __init__(self, type, data, flags, file, line, column):
  79. self.type = type
  80. self.data = data
  81. self.flags = flags
  82. self.file = file
  83. self.line = line
  84. self.column = column
  85. kTokenRE = re.compile(r"""([a-z_]+) '(.*)'\t(.*)\tLoc=<(.*):(.*):(.*)>""",
  86. re.DOTALL | re.MULTILINE)
  87. def getTokens(path):
  88. p = subprocess.Popen(['clang','-dump-raw-tokens',path],
  89. stdin=subprocess.PIPE,
  90. stdout=subprocess.PIPE,
  91. stderr=subprocess.PIPE)
  92. out,err = p.communicate()
  93. tokens = []
  94. collect = None
  95. for ln in err.split('\n'):
  96. # Silly programmers refuse to print in simple machine readable
  97. # formats. Whatever.
  98. if collect is None:
  99. collect = ln
  100. else:
  101. collect = collect + '\n' + ln
  102. if 'Loc=<' in ln and ln.endswith('>'):
  103. ln,collect = collect,None
  104. tokens.append(Token(*kTokenRE.match(ln).groups()))
  105. return tokens
  106. ###
  107. class TMBDDelta(DeltaAlgorithm):
  108. def __init__(self, testProgram, tokenLists, log):
  109. def patchName(name, suffix):
  110. base,ext = os.path.splitext(name)
  111. return base + '.' + suffix + ext
  112. super(TMBDDelta, self).__init__()
  113. self.testProgram = testProgram
  114. self.tokenLists = tokenLists
  115. self.tempFiles = [patchName(f,'tmp')
  116. for f,_ in self.tokenLists]
  117. self.targetFiles = [patchName(f,'ok')
  118. for f,_ in self.tokenLists]
  119. self.log = log
  120. self.numTests = 0
  121. def writeFiles(self, changes, fileNames):
  122. assert len(fileNames) == len(self.tokenLists)
  123. byFile = [[] for i in self.tokenLists]
  124. for i,j in changes:
  125. byFile[i].append(j)
  126. for i,(file,tokens) in enumerate(self.tokenLists):
  127. f = open(fileNames[i],'w')
  128. for j in byFile[i]:
  129. f.write(tokens[j])
  130. f.close()
  131. return byFile
  132. def test(self, changes):
  133. self.numTests += 1
  134. byFile = self.writeFiles(changes, self.tempFiles)
  135. if self.log:
  136. print('TEST - ', end=' ', file=sys.stderr)
  137. if self.log > 1:
  138. for i,(file,_) in enumerate(self.tokenLists):
  139. indices = byFile[i]
  140. if i:
  141. sys.stderr.write('\n ')
  142. sys.stderr.write('%s:%d tokens: [' % (file,len(byFile[i])))
  143. prev = None
  144. for j in byFile[i]:
  145. if prev is None or j != prev + 1:
  146. if prev:
  147. sys.stderr.write('%d][' % prev)
  148. sys.stderr.write(str(j))
  149. sys.stderr.write(':')
  150. prev = j
  151. if byFile[i]:
  152. sys.stderr.write(str(byFile[i][-1]))
  153. sys.stderr.write('] ')
  154. else:
  155. print(', '.join(['%s:%d tokens' % (file, len(byFile[i]))
  156. for i,(file,_) in enumerate(self.tokenLists)]), end=' ', file=sys.stderr)
  157. p = subprocess.Popen([self.testProgram] + self.tempFiles)
  158. res = p.wait() == 0
  159. if res:
  160. self.writeFiles(changes, self.targetFiles)
  161. if self.log:
  162. print('=> %s' % res, file=sys.stderr)
  163. else:
  164. if res:
  165. print('\nSUCCESS (%d tokens)' % len(changes))
  166. else:
  167. sys.stderr.write('.')
  168. return res
  169. def run(self):
  170. res = super(TMBDDelta, self).run([(i,j)
  171. for i,(file,tokens) in enumerate(self.tokenLists)
  172. for j in range(len(tokens))])
  173. self.writeFiles(res, self.targetFiles)
  174. if not self.log:
  175. print(file=sys.stderr)
  176. return res
  177. def tokenBasedMultiDelta(program, files, log):
  178. # Read in the lists of tokens.
  179. tokenLists = [(file, [t.data for t in getTokens(file)])
  180. for file in files]
  181. numTokens = sum([len(tokens) for _,tokens in tokenLists])
  182. print("Delta on %s with %d tokens." % (', '.join(files), numTokens))
  183. tbmd = TMBDDelta(program, tokenLists, log)
  184. res = tbmd.run()
  185. print("Finished %s with %d tokens (in %d tests)." % (', '.join(tbmd.targetFiles),
  186. len(res),
  187. tbmd.numTests))
  188. def main():
  189. from optparse import OptionParser, OptionGroup
  190. parser = OptionParser("%prog <test program> {files+}")
  191. parser.add_option("", "--debug", dest="debugLevel",
  192. help="set debug level [default %default]",
  193. action="store", type=int, default=0)
  194. (opts, args) = parser.parse_args()
  195. if len(args) <= 1:
  196. parser.error('Invalid number of arguments.')
  197. program,files = args[0],args[1:]
  198. md = tokenBasedMultiDelta(program, files, log=opts.debugLevel)
  199. if __name__ == '__main__':
  200. try:
  201. main()
  202. except KeyboardInterrupt:
  203. print('Interrupted.', file=sys.stderr)
  204. os._exit(1) # Avoid freeing our giant cache.