#!/usr/bin/env python3
# Copyright 2017 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Splits a branch into smaller branches and uploads CLs."""

import collections
import dataclasses
import hashlib
import math
import os
import re
import tempfile

from typing import List, Set, Tuple, Dict, Any

import gclient_utils
import git_footers
import scm

import git_common as git

# If a call to `git cl split` will generate more than this number of CLs, the
# command will prompt the user to make sure they know what they're doing. Large
# numbers of CLs generated by `git cl split` have caused infrastructure issues
# in the past.
CL_SPLIT_FORCE_LIMIT = 10

# The maximum number of top reviewers to list. `git cl split` may send many CLs
# to a single reviewer, so the top reviewers with the most CLs sent to them
# will be listed.
CL_SPLIT_TOP_REVIEWERS = 5


def Emit(*msg: str):
    """Wrapper for easier mocking during tests"""
    print(*msg)


def EmitWarning(*msg: str):
    print("Warning: ", *msg)


def HashList(lst: List[Any]) -> str:
    """
    Hash a list, returning a short hex digest. Lists with identical elements
    should have the same hash, regardless of order.
    """
    # We need a bytes-like object for hashlib algorithms
    byts = bytes().join(
        (action + file).encode() for action, file in sorted(lst))
    # No security implication: we just need a deterministic output
    hashed = hashlib.sha1(byts)
    return hashed.hexdigest()[:10]


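# Illustrative usage (not part of the original file): because the pairs are
# sorted before hashing, the result is order-independent, e.g.
#   HashList([('M', 'foo/a.cc'), ('A', 'foo/b.h')])
#   == HashList([('A', 'foo/b.h'), ('M', 'foo/a.cc')])
# and is a 10-character prefix of the SHA-1 hex digest, suitable for embedding
# in branch names (see CreateBranchName below).

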
FilesAndOwnersDirectory = collections.namedtuple("FilesAndOwnersDirectory",
                                                 "files owners_directories")


@dataclasses.dataclass
class CLInfo:
    """
    Data structure representing a single CL. The script will split the large CL
    into a list of these.

    Fields:
    - reviewers: the reviewers the CL will be sent to.
    - files: a list of <action>, <file> pairs in the CL.
             Has the same format as `git status`.
    - description: a string describing the CL. Typically the list of affected
                   directories. Only used for replacing $description in
                   the user-provided CL description.
    """
    # Have to use default_factory because lists are mutable
    reviewers: Set[str] = dataclasses.field(default_factory=set)
    files: List[Tuple[str, str]] = dataclasses.field(default_factory=list)
    # This is only used for formatting in the CL description, so it just
    # has to be convertible to string.
    description: Any = ""

    def FormatForPrinting(self) -> str:
        """
        Format the CLInfo for printing to a file in a human-readable format.
        """
        # Don't quote the reviewer emails in the output
        reviewers_str = ", ".join(self.reviewers)
        lines = [
            f"Reviewers: [{reviewers_str}]", f"Description: {self.description}"
        ] + [f"{action}, {file}" for (action, file) in self.files]
        return "\n".join(lines)


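# Illustrative example (not part of the original file): a CLInfo such as
#   CLInfo(reviewers={'a@chromium.org'},
#          files=[('M', 'foo/bar.cc'), ('A', 'foo/baz.h')],
#          description="['foo']")
# is rendered by FormatForPrinting() as:
#   Reviewers: [a@chromium.org]
#   Description: ['foo']
#   M, foo/bar.cc
#   A, foo/baz.h
# which is also the per-CL format that ParseSplittings() below reads back in.

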
def CLInfoFromFilesAndOwnersDirectoriesDict(
        d: Dict[Tuple[str], FilesAndOwnersDirectory]) -> List[CLInfo]:
    """
    Transform a dictionary mapping reviewer tuples to FilesAndOwnersDirectories
    into a list of CLInfo
    """
    cl_infos = []
    for (reviewers, fod) in d.items():
        cl_infos.append(
            CLInfo(set(reviewers), fod.files,
                   FormatDirectoriesForPrinting(fod.owners_directories)))
    return cl_infos


def EnsureInGitRepository():
    """Throws an exception if the current directory is not a git repository."""
    git.run('rev-parse')


def GetGitInfo(repository_root, cl) -> Tuple[List[Tuple[str, str]], str, str]:
    """
    Get various information by running git commands.
    Specifically, determine which branch we're on, which upstream we're
    targeting, and the list of changed files (and the associated git actions)
    that make up the CL we're splitting.
    """
    upstream = cl.GetCommonAncestorWithUpstream()
    files = [(action.strip(), f)
             for action, f in scm.GIT.CaptureStatus(repository_root, upstream)]
    refactor_branch = git.current_branch()
    assert refactor_branch, "Can't run from detached branch."
    refactor_branch_upstream = git.upstream(refactor_branch)
    assert refactor_branch_upstream, \
        "Branch %s must have an upstream." % refactor_branch
    return files, refactor_branch, refactor_branch_upstream


def CreateBranchName(prefix: str, files: List[Tuple[str, str]]) -> str:
    """
    Given a sub-CL as a list of (action, file) pairs, create a unique and
    deterministic branch name for it.
    The name has the format <prefix>_<hash(files)>_<dirname>_split.
    """
    file_names = [file for _, file in files]
    if len(file_names) == 1:
        # Only one file, just use its directory as the common path
        common_path = os.path.dirname(file_names[0])
    else:
        common_path = os.path.commonpath(file_names)
    if not common_path:
        # Files have nothing in common at all. Unlikely but possible.
        common_path = "None"
    # Replace path delimiter with underscore in common_path.
    common_path = common_path.replace(os.path.sep, '_')
    return f"{prefix}_{HashList(files)}_{common_path}_split"


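# Illustrative example (not part of the original file): with prefix 'myfeature'
# and files [('M', 'foo/bar/a.cc'), ('M', 'foo/bar/b.cc')], the common path is
# 'foo/bar', so the branch name looks like
#   myfeature_<10-hex-char-hash>_foo_bar_split
# Because the name is deterministic for a given file list, a resumed run can
# recognize branches it has already created (see ValidateExistingBranches).

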
def CreateBranchForOneCL(prefix: str, files: List[Tuple[str, str]],
                         upstream: str) -> bool:
    """Creates a branch for one sub-CL, named as in CreateBranchName.
    Return false if the branch already exists. |upstream| is used as upstream
    for the created branch.
    """
    branches_on_disk = set(git.branches(use_limit=False))
    branch_name = CreateBranchName(prefix, files)
    if branch_name in branches_on_disk:
        return False
    git.run('checkout', '-t', upstream, '-b', branch_name)
    return True


def ValidateExistingBranches(prefix: str, cl_infos: List[CLInfo]) -> bool:
    """
    Check if there are splitting branches left over from a previous run.
    We only allow branches to exist if we're resuming a previous upload,
    in which case we require that the existing branches are a subset of
    the branches we're going to generate.
    """
    branches_on_disk = set(
        branch for branch in git.branches(use_limit=False)
        if branch.startswith(prefix + "_") and branch.endswith("_split"))
    branches_to_be_made = set(
        CreateBranchName(prefix, info.files) for info in cl_infos)
    if not branches_on_disk.issubset(branches_to_be_made):
        Emit("It seems like you've already run `git cl split` on this branch.\n"
             "If you're resuming a previous upload, you must pass in the "
             "same splitting as before, using the --from-file option.\n"
             "If you're starting a new upload, please clean up existing split "
             f"branches (starting with '{prefix}_' and ending with '_split'), "
             "and re-run the tool.")
        Emit("The following branches need to be cleaned up:\n")
        for branch in branches_on_disk - branches_to_be_made:
            Emit(branch)
        return False
    return True


def FormatDirectoriesForPrinting(directories: List[str],
                                 prefix: str = None) -> str:
    """Formats directory list for printing
    Uses dedicated format for single-item list."""
    prefixed = directories
    if prefix:
        prefixed = [(prefix + d) for d in directories]
    return str(prefixed[0]) if len(prefixed) == 1 else str(prefixed)


def FormatDescriptionOrComment(txt, desc):
    """Replaces $description with |desc| in |txt|."""
    # TODO(389069356): Remove support for $directory entirely once it's been
    # deprecated for a while.
    replaced_txt = txt.replace('$directory', desc)
    if txt != replaced_txt:
        EmitWarning('Usage of $directory is deprecated and will be removed '
                    'in a future update. Please use $description instead, '
                    'which has the same behavior by default.\n\n')
    replaced_txt = replaced_txt.replace('$description', desc)
    return replaced_txt


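# Illustrative example (not part of the original file):
#   FormatDescriptionOrComment('foo: Split $description\n\nBug: 123',
#                              "['foo/bar']")
# returns "foo: Split ['foo/bar']\n\nBug: 123". The deprecated $directory
# placeholder is substituted the same way, but emits a warning.

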
def AddUploadedByGitClSplitToDescription(description, is_experimental=False):
    """Adds a 'This CL was uploaded by git cl split.' line to |description|.

    The line is added before footers, or at the end of |description| if it has
    no footers.
    """
    if is_experimental:
        new_lines = [
            'This CL was uploaded by an experimental version of git cl split',
            '(https://crbug.com/389069356).'
        ]
    else:
        new_lines = ['This CL was uploaded by git cl split.']
    split_footers = git_footers.split_footers(description)
    lines = split_footers[0]
    if lines[-1] and not lines[-1].isspace():
        lines = lines + ['']
    lines = lines + new_lines
    if split_footers[1]:
        lines += [''] + split_footers[1]
    return '\n'.join(lines)


def UploadCl(refactor_branch, refactor_branch_upstream, cl_description, files,
             user_description, saved_splitting_file, comment, reviewers,
             changelist, cmd_upload, cq_dry_run, enable_auto_submit, topic,
             repository_root):
    """Uploads a CL with all changes to |files| in |refactor_branch|.

    Args:
        refactor_branch: Name of the branch that contains the changes to upload.
        refactor_branch_upstream: Name of the upstream of |refactor_branch|.
        cl_description: Description of this specific CL, e.g. the list of
            affected directories.
        files: List of AffectedFile instances to include in the uploaded CL.
        user_description: Description provided by user.
        saved_splitting_file: File the splitting was saved to, referenced in
            the resume instructions if the upload fails.
        comment: Comment to post on the uploaded CL.
        reviewers: A set of reviewers for the CL.
        changelist: The Changelist class.
        cmd_upload: The function associated with the git cl upload command.
        cq_dry_run: If CL uploads should also do a cq dry run.
        enable_auto_submit: If CL uploads should also enable auto submit.
        topic: Topic to associate with uploaded CLs.
        repository_root: Absolute path of the repository root.
    """
    # Create a branch.
    if not CreateBranchForOneCL(refactor_branch, files,
                                refactor_branch_upstream):
        Emit(
            f'Skipping existing branch for CL with description: {cl_description}'
        )
        return

    # Checkout all changes to files in |files|.
    deleted_files = []
    modified_files = []
    for action, f in files:
        abspath = os.path.abspath(os.path.join(repository_root, f))
        if action == 'D':
            deleted_files.append(abspath)
        else:
            modified_files.append(abspath)

    if deleted_files:
        git.run(*['rm'] + deleted_files)
    if modified_files:
        git.run(*['checkout', refactor_branch, '--'] + modified_files)

    # Commit changes. The temporary file is created with delete=False so that it
    # can be deleted manually after git has read it rather than automatically
    # when it is closed.
    with gclient_utils.temporary_file() as tmp_file:
        gclient_utils.FileWrite(
            tmp_file,
            FormatDescriptionOrComment(user_description, cl_description))
        git.run('commit', '-F', tmp_file)

    # Upload a CL.
    upload_args = ['-f']
    if reviewers:
        upload_args.extend(['-r', ','.join(sorted(reviewers))])
    if cq_dry_run:
        upload_args.append('--cq-dry-run')
    if not comment:
        upload_args.append('--send-mail')
    if enable_auto_submit:
        upload_args.append('--enable-auto-submit')
    if topic:
        upload_args.append('--topic={}'.format(topic))
    Emit(f'Uploading CL with description: {cl_description} ...')
    ret = cmd_upload(upload_args)
    if ret != 0:
        Emit('Uploading failed.')
        Emit('Note: git cl split has built-in resume capabilities.')
        Emit(f'Delete {git.current_branch()} then run\n'
             f'git cl split --from-file={saved_splitting_file}\n'
             'to resume uploading.')

    if comment:
        changelist().AddComment(FormatDescriptionOrComment(
            comment, cl_description),
                                publish=True)


def GetFilesSplitByOwners(files, max_depth, repository_root):
    """Returns a map of files split by OWNERS file.

    Args:
        files: List of the file paths to be grouped by the OWNERS.
            Note that each path is relative to the repository root.
        max_depth: Max depth to traverse from the repository path.
        repository_root: Absolute path to the repository root.

    Returns:
        A map where keys are paths to directories containing an OWNERS file and
        values are lists of files sharing an OWNERS file.
    """
    files_split_by_owners = {}
    for action, path in files:
        # normpath() is important to normalize separators here, in preparation
        # for str.split() below. It would be nicer to use something like
        # pathlib here but alas...
        dir_with_owners = os.path.normpath(os.path.dirname(path))
        if max_depth >= 1:
            dir_with_owners = os.path.join(
                *dir_with_owners.split(os.path.sep)[:max_depth])
        # Find the closest parent directory with an OWNERS file.
        dir_with_owners = os.path.join(repository_root, dir_with_owners)
        while dir_with_owners != repository_root:
            if dir_with_owners in files_split_by_owners:
                break
            owners_path = os.path.join(dir_with_owners, 'OWNERS')
            if os.path.isfile(owners_path):
                break
            if os.path.lexists(owners_path):
                raise ClSplitParseError(
                    f'{owners_path} exists, but is not a file')
            dir_with_owners = os.path.dirname(dir_with_owners)
        files_split_by_owners.setdefault(dir_with_owners, []).append(
            (action, path))
    return files_split_by_owners


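# Illustrative example (not part of the original file), assuming foo/OWNERS and
# foo/bar/OWNERS exist but foo/bar/baz has no OWNERS file: with max_depth < 1,
# the changes
#   [('M', 'foo/a.cc'), ('M', 'foo/bar/b.cc'), ('A', 'foo/bar/baz/c.h')]
# are grouped roughly as
#   {<root>/foo:     [('M', 'foo/a.cc')],
#    <root>/foo/bar: [('M', 'foo/bar/b.cc'), ('A', 'foo/bar/baz/c.h')]}
# since foo/bar/baz falls back to its closest parent directory with OWNERS.

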
def PrintClInfo(cl_index, num_cls, cl_description, file_paths, user_description,
                reviewers, cq_dry_run, enable_auto_submit, topic):
    """Prints info about a CL.

    Args:
        cl_index: The index of this CL in the list of CLs to upload.
        num_cls: The total number of CLs that will be uploaded.
        cl_description: Description of this specific CL, e.g. the list of
            affected directories.
        file_paths: A list of files in this CL.
        user_description: Description provided by user.
        reviewers: A set of reviewers for this CL.
        cq_dry_run: If the CL should also be sent to CQ dry run.
        enable_auto_submit: If the CL should also have auto submit enabled.
        topic: Topic to set for this CL.
    """
    description_lines = FormatDescriptionOrComment(user_description,
                                                   cl_description).splitlines()
    indented_description = '\n'.join([' ' + l for l in description_lines])

    Emit('CL {}/{}'.format(cl_index, num_cls))
    Emit('Paths: {}'.format(cl_description))
    Emit('Reviewers: {}'.format(', '.join(reviewers)))
    Emit('Auto-Submit: {}'.format(enable_auto_submit))
    Emit('CQ Dry Run: {}'.format(cq_dry_run))
    Emit('Topic: {}'.format(topic))
    Emit('\n' + indented_description + '\n')
    Emit('\n'.join(file_paths))


def LoadDescription(description_file, dry_run):
    if not description_file:
        if not dry_run:
            # Parser checks this as well, so should be impossible
            raise ValueError(
                "Must provide a description file except during dry runs")
        return ('Dummy description for dry run.\n'
                'description = $description')
    return gclient_utils.FileRead(description_file)


def ProcessDescription(description_file: str, dry_run: bool,
                       target_range: bool) -> str:
    """
    Load the provided description, append the note about git cl split, and
    (on a real run) validate that it contains a bug link.
    Returns the loaded description, or None if the user aborted due to a
    missing bug link.
    """
    description = LoadDescription(description_file, dry_run)
    description = AddUploadedByGitClSplitToDescription(
        description, is_experimental=target_range)
    if not dry_run and not CheckDescriptionBugLink(description):
        return None
    return description


def PrintSummary(cl_infos, refactor_branch):
    """Print a brief summary of the splitting so the user
    can review it before uploading.

    Args:
        cl_infos: A list of CLInfo describing the proposed splitting.
        refactor_branch: Name of the branch being split.
    """
    for info in cl_infos:
        Emit(f'Reviewers: {info.reviewers}, files: {len(info.files)}, '
             f'description: {info.description}')

    num_cls = len(cl_infos)
    Emit(f'\nWill split branch {refactor_branch} into {num_cls} CLs. '
         'Please quickly review them before proceeding.\n')
    if (num_cls > CL_SPLIT_FORCE_LIMIT):
        EmitWarning(
            'Uploading this many CLs may potentially '
            'reach the limit of concurrent runs, imposed on you by the '
            'build infrastructure. Your runs may be throttled as a '
            'result.\n\nPlease email infra-dev@chromium.org if you '
            'have any questions. '
            'The infra team reserves the right to cancel '
            'your jobs if they are overloading the CQ.\n\n'
            '(Alternatively, you can reduce the number of CLs created by '
            'using the --max-depth option, or altering the arguments to '
            '--target-range, as appropriate. Pass --dry-run to examine the '
            'CLs which will be created until you are happy with the '
            'results.)')


def SummarizeAndValidate(dry_run: bool, summarize: bool,
                         files: List[Tuple[str, str]], refactor_branch: str,
                         cl_infos: List[CLInfo]) -> Tuple[List[CLInfo], str]:
    """
    Print a summary of the generated splitting for the user. If we're doing a
    real run, prompt the user to confirm the splitting is acceptable, and
    allow them to edit it if they wish.
    If we're doing a real run, also save the splitting to a file so the user
    can safely resume an aborted upload with the same splitting.

    Arguments:
        dry_run: Whether or not we're doing a dry run
        summarize: If we're doing a dry run, should we print a concise summary first
        files: The list of (action, file) pairs that make up the CL we're splitting
        refactor_branch: Name of the branch we're splitting
        cl_infos: The proposed splitting, as a list of CLInfo

    Returns:
        A pair of the edited cl_infos and the name of the file to which we saved
        the splitting. If the user aborts, the edited cl_infos will be falsy.
    """
    if not dry_run or summarize:
        PrintSummary(cl_infos, refactor_branch)
    if dry_run:
        return cl_infos, ""

    answer = gclient_utils.AskForData(
        'Proceed? (y/N, or i to edit interactively): ')
    if answer.lower() == 'i':
        cl_infos, saved_splitting_file = EditSplittingInteractively(
            cl_infos, files_on_disk=files)
    else:
        # Save so the user can use the splitting later if they want to
        saved_splitting_file = SaveSplittingToTempFile(cl_infos)
        if answer.lower() != 'y':
            return None, saved_splitting_file

    # Make sure there isn't any clutter left over from a previous run
    if not ValidateExistingBranches(refactor_branch, cl_infos):
        return None, saved_splitting_file

    return cl_infos, saved_splitting_file


def ComputeSplitting(
    from_file: str,
    files: List[Tuple[str, str]],
    target_range: Tuple[int, int],
    max_depth: int,
    reviewers_override: List[str],
    expect_owners_override: bool,
    cl,
    repository_root: str,
) -> List[CLInfo]:
    """
    Split the current CL into sub-CLs by partitioning the files and assigning
    reviewers. The method used depends on the command-line arguments.

    Arguments are the same as SplitCl, except for the following:
        cl: Changelist class instance, for calling owners methods
    """
    author = git.run('config', 'user.email').strip() or None

    if from_file:
        # Load a precomputed splitting
        cl_infos = LoadSplittingFromFile(from_file, files_on_disk=files)
    elif target_range:
        # Use the directory-based clustering algorithm
        min_files, max_files = target_range
        cl_infos = GroupFilesByDirectory(cl, author, expect_owners_override,
                                         files, min_files, max_files)
    else:
        # Use the default algorithm
        files_split_by_reviewers = SelectReviewersForFiles(
            cl, author, files, max_depth, repository_root)
        cl_infos = CLInfoFromFilesAndOwnersDirectoriesDict(
            files_split_by_reviewers)

    # Note that we do this override even if the list is empty (indicating that
    # the user requested CLs not be assigned to any reviewers).
    if reviewers_override != None:
        for info in cl_infos:
            info.reviewers = set(reviewers_override)

    return cl_infos


def SplitCl(description_file, comment_file, changelist, cmd_upload, dry_run,
            summarize, reviewers_override, cq_dry_run, enable_auto_submit,
            max_depth, topic, target_range, expect_owners_override, from_file,
            repository_root):
    """Splits a branch into smaller branches and uploads CLs.

    Args:
        description_file: File containing the description of uploaded CLs.
        comment_file: File containing the comment of uploaded CLs.
        changelist: The Changelist class.
        cmd_upload: The function associated with the git cl upload command.
        dry_run: Whether this is a dry run (no branches or CLs created).
        reviewers_override: Either None or a (possibly empty) list of reviewers
            all CLs should be sent to.
        cq_dry_run: If CL uploads should also do a cq dry run.
        enable_auto_submit: If CL uploads should also enable auto submit.
        max_depth: The maximum directory depth to search for OWNERS files. A
            value less than 1 means no limit.
        topic: Topic to associate with split CLs.
        repository_root: Absolute path of the repository root.

    Returns:
        0 in case of success. 1 in case of error.
    """
    EnsureInGitRepository()

    cl = changelist()

    # Get the list of changed files, as well as the branch we're on and its
    # upstream.
    files, refactor_branch, refactor_branch_upstream = GetGitInfo(
        repository_root, cl)
    if not files:
        Emit('Cannot split an empty CL.')
        return 1

    # Load and validate the description and comment files now, so we can error
    # early if there's a problem with them.
    comment = gclient_utils.FileRead(comment_file) if comment_file else None
    description = ProcessDescription(description_file, dry_run, target_range)
    if not description:
        return 0

    cl_infos = ComputeSplitting(from_file, files, target_range, max_depth,
                                reviewers_override, expect_owners_override, cl,
                                repository_root)

    cl_infos, saved_splitting_file = SummarizeAndValidate(
        dry_run, summarize, files, refactor_branch, cl_infos)
    # If the user aborted, we're done
    if not cl_infos:
        return 0

    cls_per_reviewer = collections.defaultdict(int)
    for cl_index, cl_info in enumerate(cl_infos, 1):
        if dry_run and summarize:
            pass
        elif dry_run:
            file_paths = [f for _, f in cl_info.files]
            PrintClInfo(cl_index, len(cl_infos), cl_info.description,
                        file_paths, description, cl_info.reviewers, cq_dry_run,
                        enable_auto_submit, topic)
        else:
            UploadCl(refactor_branch, refactor_branch_upstream,
                     cl_info.description, cl_info.files, description,
                     saved_splitting_file, comment, cl_info.reviewers,
                     changelist, cmd_upload, cq_dry_run, enable_auto_submit,
                     topic, repository_root)

        for reviewer in cl_info.reviewers:
            cls_per_reviewer[reviewer] += 1

    # List the top reviewers that will be sent the most CLs as a result of
    # the split.
    reviewer_rankings = sorted(cls_per_reviewer.items(),
                               key=lambda item: item[1],
                               reverse=True)
    Emit('The top reviewers are:')
    for reviewer, count in reviewer_rankings[:CL_SPLIT_TOP_REVIEWERS]:
        Emit(f' {reviewer}: {count} CLs')

    if dry_run:
        # Wait until now to save the splitting so the file name doesn't get
        # washed away by the flood of dry-run printing.
        SaveSplittingToTempFile(cl_infos)

    # Go back to the original branch.
    git.run('checkout', refactor_branch)
    return 0


def CheckDescriptionBugLink(description):
    """Verifies that the description contains a bug link.

    Examples:
        Bug: 123
        Bug: chromium:456

    Prompts user if the description does not contain a bug link.
    """
    bug_pattern = re.compile(r"^Bug:\s*(?:[a-zA-Z]+:)?[0-9]+", re.MULTILINE)
    matches = re.findall(bug_pattern, description)
    answer = 'y'
    if not matches:
        answer = gclient_utils.AskForData(
            'Description does not include a bug link. Proceed? (y/N):')
    return answer.lower() == 'y'


def SelectReviewersForFiles(cl, author, files, max_depth, repository_root):
    """Selects reviewers for passed-in files

    Args:
        cl: Changelist class instance
        author: Email of person running 'git cl split'
        files: List of files
        max_depth: The maximum directory depth to search for OWNERS files.
            A value less than 1 means no limit.
        repository_root: Absolute path of the repository root
    """
    info_split_by_owners = GetFilesSplitByOwners(files, max_depth,
                                                 repository_root)

    info_split_by_reviewers = {}
    for (directory, split_files) in info_split_by_owners.items():
        # Use '/' as a path separator in the branch name and the CL description
        # and comment.
        directory = directory.replace(os.path.sep, '/')
        file_paths = [f for _, f in split_files]
        # Convert reviewers list to tuple in order to use reviewers as key to
        # dictionary.
        reviewers = tuple(
            cl.owners_client.SuggestOwners(
                file_paths, exclude=[author, cl.owners_client.EVERYONE]))
        if not reviewers in info_split_by_reviewers:
            info_split_by_reviewers[reviewers] = FilesAndOwnersDirectory([], [])
        info_split_by_reviewers[reviewers].files.extend(split_files)
        info_split_by_reviewers[reviewers].owners_directories.append(directory)
    return info_split_by_reviewers


################################################################################
# Code for saving, editing, and loading splittings.
################################################################################


def SaveSplittingToFile(cl_infos: List[CLInfo], filename: str, silent=False):
    """
    Writes the listed CLs to the designated file, in a human-readable and
    editable format. Include an explanation of the file format at the top,
    as well as instructions for how to use it.
    """
    preamble = (
        "# CLs in this file must have the following format:\n"
        "# A 'Reviewers: [...]' line, where '...' is a (possibly empty) list "
        "of reviewer emails.\n"
        "# A 'Description: ...' line, where '...' is any string (by default, "
        "the list of directories the files have been pulled from).\n"
        "# One or more file lines, consisting of an <action>, <file> pair, in "
        "the format output by `git status`.\n\n"
        "# Each 'Reviewers' line begins a new CL.\n"
        "# To use the splitting in this file, use the --from-file option.\n\n")
    cl_string = "\n\n".join([info.FormatForPrinting() for info in cl_infos])
    gclient_utils.FileWrite(filename, preamble + cl_string)
    if not silent:
        Emit(f"Saved splitting to {filename}")


def SaveSplittingToTempFile(cl_infos: List[CLInfo], silent=False):
    """
    Create a file in the user's temp directory, and save the splitting there.
    """
    # We can't use gclient_utils.temporary_file because it will be removed
    temp_file, temp_name = tempfile.mkstemp(prefix="split_cl_")
    os.close(temp_file)  # Necessary for windows
    SaveSplittingToFile(cl_infos, temp_name, silent)
    return temp_name


class ClSplitParseError(Exception):
    pass


# Matches 'Reviewers: [...]', extracts the ...
reviewers_re = re.compile(r'Reviewers:\s*\[([^\]]*)\]')
# Matches 'Description: ...', extracts the ...
description_re = re.compile(r'Description:\s*(.+)')
# Matches '<action>, <file>', and extracts both
# <action> must be a valid code (either 1 or 2 letters)
file_re = re.compile(r'([MTADRC]{1,2}),\s*(.+)')


# We use regex parsing instead of e.g. json because it lets us use a much more
# human-readable format, similar to the summary printed in dry runs
def ParseSplittings(lines: List[str]) -> List[CLInfo]:
    """
    Parse a splitting file. We expect to get a series of lines in the format
    of CLInfo.FormatForPrinting. In the following order, we expect to see
    - A 'Reviewers: ' line containing a list,
    - A 'Description: ' line containing anything, and
    - A list of <action>, <path> pairs, each on its own line

    Note that this function only transforms the file into a list of CLInfo
    (if possible). It does not validate the information; for that, see
    ValidateSplitting.
    """
    cl_infos = []
    current_cl_info = None
    for line in lines:
        line = line.strip()
        # Skip empty or commented lines
        if not line or line.startswith('#'):
            continue

        # Start a new CL whenever we see a new Reviewers: line
        m = re.fullmatch(reviewers_re, line)
        if m:
            reviewers_str = m.group(1)
            reviewers = [r.strip() for r in reviewers_str.split(",")]
            # Account for empty list or trailing comma
            if not reviewers[-1]:
                reviewers = reviewers[:-1]
            if current_cl_info:
                cl_infos.append(current_cl_info)
            current_cl_info = CLInfo(reviewers=reviewers)
            continue
        if not current_cl_info:
            # Make sure no nonempty lines appear before the first CL
            raise ClSplitParseError(
                f"Error: Line appears before the first 'Reviewers: ' line:\n{line}"
            )

        # Description is just used as a description, so any string is fine
        m = re.fullmatch(description_re, line)
        if m:
            if current_cl_info.description:
                raise ClSplitParseError(
                    f"Error parsing line: CL already has a description entry\n{line}"
                )
            current_cl_info.description = m.group(1).strip()
            continue

        # Any other line is presumed to be an '<action>, <file>' pair
        m = re.fullmatch(file_re, line)
        if m:
            action, path = m.groups()
            current_cl_info.files.append((action, path))
            continue

        raise ClSplitParseError("Error parsing line: Does not look like\n"
                                "'Reviewers: [...]',\n"
                                "'Description: ...', or\n"
                                f"a pair of '<action>, <file>':\n{line}")
    if (current_cl_info):
        cl_infos.append(current_cl_info)
    return cl_infos


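# Illustrative example (not part of the original file): ParseSplittings()
# accepts the same format SaveSplittingToFile() writes, e.g. the lines
#   Reviewers: [a@chromium.org, b@chromium.org]
#   Description: ['foo/bar']
#   M, foo/bar/a.cc
#   D, foo/bar/b.h
#
#   Reviewers: []
#   Description: ['docs']
#   A, docs/README.md
# parse into two CLInfo objects; checking them against the files git reports
# is done separately, in ValidateSplitting().

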
def ValidateSplitting(cl_infos: List[CLInfo], filename: str,
                      files_on_disk: List[Tuple[str, str]]):
    """
    Ensure that the provided list of CLs is a valid splitting.
    Specifically, check that:
    - Each file is in at most one CL
    - Each file and action appear in the list of changed files reported by git
    - Warn if some files don't appear in any CL
    - Warn if a reviewer string looks wrong, or if a CL is empty
    """
    # Validate the parsed information
    if not cl_infos:
        EmitWarning("No CLs listed in file. No action will be taken.")
        return []

    files_in_loaded_cls = set()
    # Collect all files, ensuring no duplicates
    # Warn on empty CLs or invalid reviewer strings
    for info in cl_infos:
        if not info.files:
            EmitWarning("CL has no files, and will be skipped:\n",
                        info.FormatForPrinting())
        for file_info in info.files:
            if file_info in files_in_loaded_cls:
                raise ClSplitParseError(
                    f"File appears in multiple CLs in {filename}:\n{file_info}")
            files_in_loaded_cls.add(file_info)
        for reviewer in info.reviewers:
            if not (re.fullmatch(r"[^@]+@[^.]+\..+", reviewer)):
                EmitWarning("reviewer does not look like an email address: ",
                            reviewer)

    # Strip empty CLs
    cl_infos = [info for info in cl_infos if info.files]

    # Ensure the files in the user-provided CL splitting match the files
    # that git reports.
    # Warn if not all the files git reports appear.
    # Fail if the user mentions a file that isn't reported by git
    files_on_disk = set(files_on_disk)
    if not files_in_loaded_cls.issubset(files_on_disk):
        extra_files = files_in_loaded_cls.difference(files_on_disk)
        extra_files_str = "\n".join(f"{action}, {file}"
                                    for (action, file) in extra_files)
        raise ClSplitParseError(
            f"Some files are listed in {filename} but do not match any files "
            f"listed by git:\n{extra_files_str}")

    unmentioned_files = files_on_disk.difference(files_in_loaded_cls)
    if (unmentioned_files):
        EmitWarning(
            f"the following files are not included in any CL in {filename}. "
            "They will not be uploaded:")
        for file in unmentioned_files:
            Emit(file)


def LoadSplittingFromFile(filename: str,
                          files_on_disk: List[Tuple[str, str]]) -> List[CLInfo]:
    """
    Given a file and the list of <action>, <file> pairs reported by git,
    read the file and return the list of CLInfos it contains.
    """
    lines = gclient_utils.FileRead(filename).splitlines()
    cl_infos = ParseSplittings(lines)
    ValidateSplitting(cl_infos, filename, files_on_disk)
    return cl_infos


def EditSplittingInteractively(
        cl_infos: List[CLInfo],
        files_on_disk: List[Tuple[str, str]]) -> Tuple[List[CLInfo], str]:
    """
    Allow the user to edit the generated splitting using their default editor.
    Make sure the edited splitting is saved so they can retrieve it if needed.
    """
    tmp_file = SaveSplittingToTempFile(cl_infos, silent=True)
    splitting = gclient_utils.RunEditor(gclient_utils.FileRead(tmp_file), False)
    cl_infos = ParseSplittings(splitting.splitlines())
    # Save the edited splitting before validation, so the user can go back
    # and edit it if there are any typos
    SaveSplittingToFile(cl_infos, tmp_file)
    ValidateSplitting(cl_infos, "the provided splitting", files_on_disk)
    return cl_infos, tmp_file


################################################################################
# Code for the clustering-based splitting algorithm.
################################################################################


def GroupFilesByDirectory(cl, author: str, expect_owners_override: bool,
                          all_files: List[Tuple[str, str]], min_files: int,
                          max_files: int) -> List[CLInfo]:
    """
    Group the contents of |all_files| into clusters of size between |min_files|
    and |max_files|, inclusive, based on their directory structure. Assign one
    reviewer to each group to create a CL. If |expect_owners_override| is true,
    consider only the directory structure of the files, ignoring ownership.
    May rarely create groups with fewer than |min_files| files, or assign
    multiple reviewers to a single CL.

    Args:
        cl: Changelist class instance, for calling owners methods
        author: Email of person running the script; never assigned as a reviewer
    """
    # Record the actions associated with each file because the clustering
    # algorithm just takes filenames
    actions_by_file = {}
    file_paths = []
    for (action, file) in all_files:
        actions_by_file[file] = action
        file_paths.append(file)

    reviewers_so_far = []
    cls = []
    # Go through the clusters by path length so that we're likely to choose
    # top-level owners earlier
    for (directories, files) in sorted(
            ClusterFiles(expect_owners_override, file_paths, min_files,
                         max_files)):
        # Use '/' as a path separator in the branch name and the CL description
        # and comment.
        directories = [
            directory.replace(os.path.sep, '/') for directory in directories
        ]
        files_with_actions = [(actions_by_file[file], file) for file in files]
        # Try to find a reviewer. If some of the files have noparent set,
        # we'll likely get multiple reviewers. Don't consider reviewers we've
        # already assigned something to.
        # FIXME: Rather than excluding existing reviewers, it would be better
        # to just penalize them, but still choose them over reviewers who have
        # a worse score. At the moment, owners_client doesn't support anything
        # to do with the score.
        reviewers = cl.owners_client.SuggestMinimalOwners(
            files,
            exclude=[author, cl.owners_client.EVERYONE] + reviewers_so_far)
        # Retry without excluding existing reviewers if we couldn't find any.
        # This is very unlikely since there are many fallback owners.
        if not reviewers:
            reviewers = cl.owners_client.SuggestMinimalOwners(
                directories, exclude=[author, cl.owners_client.EVERYONE])
        reviewers_so_far.extend(reviewers)
        cls.append(
            CLInfo(set(reviewers), files_with_actions,
                   FormatDirectoriesForPrinting(directories)))
    return cls


### Trie Code


def FolderHasParent(path: str) -> bool:
    """
    Check if a folder inherits owners from a higher-level directory:
    i.e. it's not at top level, and doesn't have an OWNERS file that contains
    `set noparent`
    """
    # Treat each top-level directory as having no parent, as well as the root
    # directory.
    if len(path.split(os.path.sep)) <= 1:
        # Top level
        return False
    owners_file = os.path.join(path, 'OWNERS')
    if (os.path.isfile(owners_file)):
        with (open(owners_file)) as f:
            for line in f.readlines():
                # Strip whitespace and comments
                line = line.split('#')[0].strip()
                if (line == 'set noparent'):
                    return False
    return True


class DirectoryTrie():
    """
    Trie structure: Nested dictionaries representing file paths.
    Each level represents one folder, and contains:
    - The path to that folder (its prefix)
    - A list of files that reside in that folder
    - A boolean for whether that folder inherits owners from a parent folder
    - One Trie representing each of that folder's subdirectories

    Files are stored with their entire path, so we don't need to reconstruct
    it every time we read them.
    """

    def __init__(self, expect_owners_override, prefix: str = ""):
        """ Create an empty DirectoryTrie with the specified prefix """
        has_parent = expect_owners_override or FolderHasParent(prefix)
        # yapf: disable
        self.subdirectories : Dict[str, DirectoryTrie] = {}
        self.files : List[str] = []
        self.prefix : str = prefix
        self.has_parent : bool = has_parent
        self.expect_owners_override : bool = expect_owners_override
        # yapf: enable

    def AddFile(self, path: List[str]):
        """
        Add a file to the Trie, adding new subdirectories if necessary.
        The file should be represented as a list of directories, with the final
        entry being the filename.
        """
        if len(path) == 1:
            self.files.append(os.path.join(self.prefix, path[0]))
        else:
            directory = path[0]
            if directory not in self.subdirectories:
                prefix = os.path.join(self.prefix, directory)
                self.subdirectories[directory] = DirectoryTrie(
                    self.expect_owners_override, prefix)
            self.subdirectories[directory].AddFile(path[1:])

    def AddFiles(self, paths: List[List[str]]):
        """ Convenience function to add many files at once. """
        for path in paths:
            self.AddFile(path)

    def ToList(self) -> List[str]:
        """ Return a list of all files in the trie. """
        files = []
        files += self.files
        for subdir in self.subdirectories.values():
            files += subdir.ToList()
        return files


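# Illustrative example (not part of the original file):
#   trie = DirectoryTrie(expect_owners_override=True)
#   trie.AddFiles([['foo', 'a.cc'], ['foo', 'bar', 'b.cc']])
# builds a root trie whose 'foo' subtrie holds 'foo/a.cc' directly and has a
# 'bar' subtrie holding 'foo/bar/b.cc'; ToList() then returns both full paths.
# (Passing expect_owners_override=True keeps the example filesystem-free, since
# no OWNERS files are consulted in that case.)

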
### Clustering code

# Convenience type: a "bin" represents a collection of files:
# it tracks their prefix(es) and the list of files themselves.
# Both elements are string lists.
Bin = collections.namedtuple("Bin", "prefixes files")


def PackFiles(max_size: int, files_to_pack: List[Bin]) -> List[Bin]:
    """
    Simple bin packing algorithm: given a list of small bins, consolidate them
    into as few larger bins as possible, where each bin can hold at most
    |max_size| files.
    """
    bins = []
    # Guess how many bins we'll need ahead of time so we can spread things
    # between them. We'll add more bins later if necessary
    expected_bins_needed = math.ceil(
        sum(len(bin.files) for bin in files_to_pack) / max_size)
    expected_avg_bin_size = math.ceil(
        sum(len(bin.files) for bin in files_to_pack) / expected_bins_needed)
    for _ in range(expected_bins_needed):
        bins.append(Bin([], []))

    # Sort by number of files, decreasing
    sorted_by_num_files = sorted(files_to_pack, key=lambda bin: -len(bin.files))

    # Invariant: the least-filled bin is always the first element of |bins|
    # This ensures we spread things between bins as much as possible.
    for (prefixes, files) in sorted_by_num_files:
        b = bins[0]
        if len(b.files) + len(files) <= max_size:
            b[0].extend(prefixes)
            b[1].extend(files)
        else:
            # Since the first bin is the emptiest, if we failed to fit in
            # that we don't need to try any others.
            # If these files alone are too large, split them up into
            # groups of size |expected_avg_bin_size|
            if len(files) > max_size:
                bins.extend([
                    Bin(prefixes, files[i:i + expected_avg_bin_size])
                    for i in range(0, len(files), expected_avg_bin_size)
                ])
            else:
                bins.append(Bin(prefixes, files))
        # Maintain invariant
        bins.sort(key=lambda bin: len(bin.files))

    return [bin for bin in bins if len(bin.files) > 0]


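# Illustrative example (not part of the original file): with max_size=10 and
# input bins holding 6, 5, and 3 files, PackFiles() first seeds
# ceil(14 / 10) = 2 empty bins, places the 6-file and 5-file groups into them,
# then adds the 3-file group to the least-filled bin (the one holding 5),
# ending with bins of 6 and 8 files. A single input bin larger than max_size is
# chopped into pieces of roughly the expected average bin size instead.

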
def ClusterFiles(expect_owners_override: bool, files: List[str], min_files: int,
                 max_files: int) -> List[Bin]:
    """
    Group the entries of |files| into clusters of size between |min_files| and
    |max_files|, inclusive. Guarantees that the size does not exceed
    |max_files|, but the size may rarely be less than |min_files|. If
    |expect_owners_override| is true, don't consider ownership when clustering,
    only directory structure.

    Clustering strategy for a given directory:
    1. Try to group each subdirectory independently
    2. Group any remaining files as follows:
       2a. If there are less than |min_files| files and the folder has a parent,
           give up and let the parent folder handle it.
       2b. Otherwise, if there are at most |max_files| files, create one
           cluster.
       2c. Finally, if there are more than |max_files| files, create several
           clusters of size less than |max_files|.
    """
    trie = DirectoryTrie(expect_owners_override)
    trie.AddFiles([file.split(os.path.sep) for file in files])

    clusters: List[Bin] = []

    def ClusterDirectory(current_dir: DirectoryTrie) -> List[str]:
        """
        Attempt to cluster the files for a directory, by grouping them into
        Bins and appending the bins to |clusters|.
        Returns a list of files that weren't able to be clustered (because
        there weren't at least |min_files| files).
        """
        # Track all the files we need to handle in this directory
        unclustered_files: List[Bin] = []

        # Record any files that live in this directory directly
        if len(current_dir.files) > 0:
            unclustered_files.append(
                Bin([current_dir.prefix], current_dir.files))

        # Step 1: Try to cluster each subdirectory independently
        for subdir in current_dir.subdirectories.values():
            unclustered_files_in_subdir = ClusterDirectory(subdir)
            # If not all files were clustered, record them
            if len(unclustered_files_in_subdir) > 0:
                unclustered_files.append(
                    Bin([subdir.prefix], unclustered_files_in_subdir))

        # A flattened list containing just the names of all unclustered files
        unclustered_files_names_only = [
            file for bin in unclustered_files for file in bin.files
        ]
        if len(unclustered_files_names_only) == 0:
            return []

        # Step 2a: If we don't have enough files for a cluster and it's possible
        # to recurse upward, do so
        if (len(unclustered_files_names_only) < min_files
                and current_dir.has_parent):
            return unclustered_files_names_only

        # Step 2b, 2c: Create one or more clusters from the unclustered files
        # by appending to the |clusters| variable in the outer scope
        nonlocal clusters
        if len(unclustered_files_names_only) <= max_files:
            clusters.append(
                Bin([current_dir.prefix], unclustered_files_names_only))
        else:
            clusters += PackFiles(max_files, unclustered_files)
        return []

    unclustered_paths = ClusterDirectory(trie)
    if (len(unclustered_paths) > 0):
        EmitWarning(
            'Not all files were assigned to a CL!\n'
            'This should be impossible, file a bug.\n'
            f'{len(unclustered_paths)} Unassigned files: {unclustered_paths}')
    return clusters
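

# Illustrative example (not part of the original file), assuming
# expect_owners_override is False: with min_files=3 and max_files=5, a change
# touching 2 files directly in foo/, 4 files in foo/bar/, and 1 file in baz/
# clusters as follows: foo/bar forms its own cluster (4 <= 5); foo's 2
# remaining files are below min_files, but foo is top-level and so has no
# parent to defer to, so they become a small cluster anyway; baz's single file
# likewise becomes its own undersized cluster rather than being dropped.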