download_from_google_storage_unittest.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496
  1. #!/usr/bin/env vpython3
  2. # Copyright (c) 2012 The Chromium Authors. All rights reserved.
  3. # Use of this source code is governed by a BSD-style license that can be
  4. # found in the LICENSE file.
  5. # pylint: disable=protected-access
  6. """Unit tests for download_from_google_storage.py."""
  7. import optparse
  8. import os
  9. import queue
  10. import shutil
  11. import sys
  12. import tarfile
  13. import tempfile
  14. import threading
  15. import unittest
  16. sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  17. import upload_to_google_storage
  18. import download_from_google_storage
  19. # ../third_party/gsutil/gsutil
  20. GSUTIL_DEFAULT_PATH = os.path.join(
  21. os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'gsutil.py')
  22. TEST_DIR = os.path.dirname(os.path.abspath(__file__))
  23. class GsutilMock(object):
  24. def __init__(self, path, boto_path, timeout=None):
  25. self.path = path
  26. self.timeout = timeout
  27. self.boto_path = boto_path
  28. self.expected = []
  29. self.history = []
  30. self.lock = threading.Lock()
  31. def add_expected(self, return_code, out, err, fn=None):
  32. self.expected.append((return_code, out, err, fn))
  33. def append_history(self, method, args):
  34. self.history.append((method, args))
  35. def call(self, *args):
  36. with self.lock:
  37. self.append_history('call', args)
  38. if self.expected:
  39. code, _out, _err, fn = self.expected.pop(0)
  40. if fn:
  41. fn()
  42. return code
  43. return 0
  44. def check_call(self, *args):
  45. with self.lock:
  46. self.append_history('check_call', args)
  47. if self.expected:
  48. code, out, err, fn = self.expected.pop(0)
  49. if fn:
  50. fn()
  51. return code, out, err
  52. return (0, '', '')
  53. def check_call_with_retries(self, *args):
  54. return self.check_call(*args)
  55. class ChangedWorkingDirectory(object):
  56. def __init__(self, working_directory):
  57. self._old_cwd = ''
  58. self._working_directory = working_directory
  59. def __enter__(self):
  60. self._old_cwd = os.getcwd()
  61. print("Enter directory = ", self._working_directory)
  62. os.chdir(self._working_directory)
  63. def __exit__(self, *_):
  64. print("Enter directory = ", self._old_cwd)
  65. os.chdir(self._old_cwd)
  66. class GstoolsUnitTests(unittest.TestCase):
  67. def setUp(self):
  68. self.temp_dir = tempfile.mkdtemp(prefix='gstools_test')
  69. self.base_path = os.path.join(self.temp_dir, 'test_files')
  70. shutil.copytree(os.path.join(TEST_DIR, 'gstools'), self.base_path)
  71. def tearDown(self):
  72. shutil.rmtree(self.temp_dir)
  73. def test_validate_tar_file(self):
  74. lorem_ipsum = os.path.join(self.base_path, 'lorem_ipsum.txt')
  75. with ChangedWorkingDirectory(self.base_path):
  76. # Sanity ok check.
  77. tar_dir = 'ok_dir'
  78. os.makedirs(os.path.join(self.base_path, tar_dir))
  79. tar = 'good.tar.gz'
  80. lorem_ipsum_copy = os.path.join(tar_dir, 'lorem_ipsum.txt')
  81. shutil.copyfile(lorem_ipsum, lorem_ipsum_copy)
  82. with tarfile.open(tar, 'w:gz') as tar:
  83. tar.add(lorem_ipsum_copy)
  84. self.assertTrue(
  85. download_from_google_storage._validate_tar_file(
  86. tar, tar_dir))
  87. # os.symlink doesn't exist on Windows.
  88. if sys.platform != 'win32':
  89. # Test no links.
  90. tar_dir_link = 'for_tar_link'
  91. os.makedirs(tar_dir_link)
  92. link = os.path.join(tar_dir_link, 'link')
  93. os.symlink(lorem_ipsum, link)
  94. tar_with_links = 'with_links.tar.gz'
  95. with tarfile.open(tar_with_links, 'w:gz') as tar:
  96. tar.add(link)
  97. self.assertFalse(
  98. download_from_google_storage._validate_tar_file(
  99. tar, tar_dir_link))
  100. # Test not outside.
  101. tar_dir_outside = 'outside_tar'
  102. os.makedirs(tar_dir_outside)
  103. tar_with_outside = 'with_outside.tar.gz'
  104. with tarfile.open(tar_with_outside, 'w:gz') as tar:
  105. tar.add(lorem_ipsum)
  106. self.assertFalse(
  107. download_from_google_storage._validate_tar_file(
  108. tar, tar_dir_outside))
  109. # Test no ../
  110. tar_with_dotdot = 'with_dotdot.tar.gz'
  111. dotdot_file = os.path.join(tar_dir, '..', tar_dir,
  112. 'lorem_ipsum.txt')
  113. with tarfile.open(tar_with_dotdot, 'w:gz') as tar:
  114. tar.add(dotdot_file)
  115. self.assertFalse(
  116. download_from_google_storage._validate_tar_file(
  117. tar, tar_dir))
  118. # Test normal file with .. in name okay
  119. tar_with_hidden = 'with_normal_dotdot.tar.gz'
  120. hidden_file = os.path.join(tar_dir, '..hidden_file.txt')
  121. shutil.copyfile(lorem_ipsum, hidden_file)
  122. with tarfile.open(tar_with_hidden, 'w:gz') as tar:
  123. tar.add(hidden_file)
  124. self.assertTrue(
  125. download_from_google_storage._validate_tar_file(
  126. tar, tar_dir))
  127. def test_gsutil(self):
  128. # This will download a real gsutil package from Google Storage.
  129. gsutil = download_from_google_storage.Gsutil(GSUTIL_DEFAULT_PATH, None)
  130. self.assertEqual(gsutil.path, GSUTIL_DEFAULT_PATH)
  131. code, _, err = gsutil.check_call()
  132. self.assertEqual(code, 0, err)
  133. self.assertEqual(err, '')
  134. def test_get_sha1(self):
  135. lorem_ipsum = os.path.join(self.base_path, 'lorem_ipsum.txt')
  136. self.assertEqual(download_from_google_storage.get_sha1(lorem_ipsum),
  137. '7871c8e24da15bad8b0be2c36edc9dc77e37727f')
  138. def test_get_md5(self):
  139. lorem_ipsum = os.path.join(self.base_path, 'lorem_ipsum.txt')
  140. self.assertEqual(upload_to_google_storage.get_md5(lorem_ipsum),
  141. '634d7c1ed3545383837428f031840a1e')
  142. def test_get_md5_cached_read(self):
  143. lorem_ipsum = os.path.join(self.base_path, 'lorem_ipsum.txt')
  144. # Use a fake 'stale' MD5 sum. Expected behavior is to return stale sum.
  145. self.assertEqual(upload_to_google_storage.get_md5_cached(lorem_ipsum),
  146. '734d7c1ed3545383837428f031840a1e')
  147. def test_get_md5_cached_write(self):
  148. lorem_ipsum2 = os.path.join(self.base_path, 'lorem_ipsum2.txt')
  149. lorem_ipsum2_md5 = os.path.join(self.base_path, 'lorem_ipsum2.txt.md5')
  150. if os.path.exists(lorem_ipsum2_md5):
  151. os.remove(lorem_ipsum2_md5)
  152. # Use a fake 'stale' MD5 sum. Expected behavior is to return stale sum.
  153. self.assertEqual(upload_to_google_storage.get_md5_cached(lorem_ipsum2),
  154. '4c02d1eb455a0f22c575265d17b84b6d')
  155. self.assertTrue(os.path.exists(lorem_ipsum2_md5))
  156. self.assertEqual(
  157. open(lorem_ipsum2_md5, 'rb').read().decode(),
  158. '4c02d1eb455a0f22c575265d17b84b6d')
  159. os.remove(lorem_ipsum2_md5) # Clean up.
  160. self.assertFalse(os.path.exists(lorem_ipsum2_md5))
  161. class DownloadTests(unittest.TestCase):
  162. def setUp(self):
  163. self.gsutil = GsutilMock(GSUTIL_DEFAULT_PATH, None)
  164. self.temp_dir = tempfile.mkdtemp(prefix='gstools_test')
  165. self.checkout_test_files = os.path.join(TEST_DIR, 'gstools',
  166. 'download_test_data')
  167. self.base_path = os.path.join(self.temp_dir, 'download_test_data')
  168. shutil.copytree(self.checkout_test_files, self.base_path)
  169. self.base_url = 'gs://sometesturl'
  170. self.parser = optparse.OptionParser()
  171. self.queue = queue.Queue()
  172. self.ret_codes = queue.Queue()
  173. self.lorem_ipsum = os.path.join(TEST_DIR, 'gstools', 'lorem_ipsum.txt')
  174. self.lorem_ipsum_sha1 = '7871c8e24da15bad8b0be2c36edc9dc77e37727f'
  175. self.maxDiff = None
  176. def tearDown(self):
  177. shutil.rmtree(self.temp_dir)
  178. def test_enumerate_files_non_recursive(self):
  179. for item in download_from_google_storage.enumerate_input(
  180. self.base_path, True, False, False, None, False, False):
  181. self.queue.put(item)
  182. expected_queue = [('e6c4fbd4fe7607f3e6ebf68b2ea4ef694da7b4fe',
  183. os.path.join(self.base_path, 'rootfolder_text.txt')),
  184. ('7871c8e24da15bad8b0be2c36edc9dc77e37727f',
  185. os.path.join(self.base_path,
  186. 'uploaded_lorem_ipsum.txt'))]
  187. self.assertEqual(sorted(expected_queue), sorted(self.queue.queue))
  188. def test_enumerate_files_recursive(self):
  189. for item in download_from_google_storage.enumerate_input(
  190. self.base_path, True, True, False, None, False, False):
  191. self.queue.put(item)
  192. expected_queue = [
  193. ('e6c4fbd4fe7607f3e6ebf68b2ea4ef694da7b4fe',
  194. os.path.join(self.base_path, 'rootfolder_text.txt')),
  195. ('7871c8e24da15bad8b0be2c36edc9dc77e37727f',
  196. os.path.join(self.base_path, 'uploaded_lorem_ipsum.txt')),
  197. ('b5415aa0b64006a95c0c409182e628881d6d6463',
  198. os.path.join(self.base_path, 'subfolder', 'subfolder_text.txt')),
  199. ('b5415aa0b64006a95c0c409182e628881d6d6463',
  200. os.path.join(self.base_path, 'subfolder2', 'subfolder_text.txt')),
  201. ]
  202. self.assertEqual(sorted(expected_queue), sorted(self.queue.queue))
  203. def test_download_worker_single_file(self):
  204. sha1_hash = self.lorem_ipsum_sha1
  205. input_filename = '%s/%s' % (self.base_url, sha1_hash)
  206. output_filename = os.path.join(self.base_path,
  207. 'uploaded_lorem_ipsum.txt')
  208. self.gsutil.add_expected(
  209. 0, '', '',
  210. lambda: shutil.copyfile(self.lorem_ipsum, output_filename)) # cp
  211. self.queue.put((sha1_hash, output_filename))
  212. self.queue.put((None, None))
  213. stdout_queue = queue.Queue()
  214. download_from_google_storage._downloader_worker_thread(
  215. 0, self.queue, False, self.base_url, self.gsutil, stdout_queue,
  216. self.ret_codes, True, False)
  217. expected_calls = [('check_call', ('cp', input_filename,
  218. output_filename))]
  219. sha1_hash = '7871c8e24da15bad8b0be2c36edc9dc77e37727f'
  220. if sys.platform != 'win32':
  221. expected_calls.append(
  222. ('check_call', ('stat', 'gs://sometesturl/' + sha1_hash)))
  223. expected_output = [
  224. '0> Downloading %s@%s...' % (output_filename, sha1_hash)
  225. ]
  226. expected_ret_codes = []
  227. self.assertEqual(list(stdout_queue.queue), expected_output)
  228. self.assertEqual(self.gsutil.history, expected_calls)
  229. self.assertEqual(list(self.ret_codes.queue), expected_ret_codes)
  230. def test_download_worker_skips_file(self):
  231. sha1_hash = 'e6c4fbd4fe7607f3e6ebf68b2ea4ef694da7b4fe'
  232. output_filename = os.path.join(self.base_path, 'rootfolder_text.txt')
  233. self.queue.put((sha1_hash, output_filename))
  234. self.queue.put((None, None))
  235. stdout_queue = queue.Queue()
  236. download_from_google_storage._downloader_worker_thread(
  237. 0, self.queue, False, self.base_url, self.gsutil, stdout_queue,
  238. self.ret_codes, True, False)
  239. # dfgs does not output anything in the no-op case.
  240. self.assertEqual(list(stdout_queue.queue), [])
  241. self.assertEqual(self.gsutil.history, [])
  242. def test_download_extract_archive(self):
  243. # Generate a gzipped tarfile
  244. output_filename = os.path.join(self.base_path, 'subfolder.tar.gz')
  245. output_dirname = os.path.join(self.base_path, 'subfolder')
  246. extracted_filename = os.path.join(output_dirname, 'subfolder_text.txt')
  247. with tarfile.open(output_filename, 'w:gz') as tar:
  248. tar.add(output_dirname, arcname='subfolder')
  249. shutil.rmtree(output_dirname)
  250. sha1_hash = download_from_google_storage.get_sha1(output_filename)
  251. input_filename = '%s/%s' % (self.base_url, sha1_hash)
  252. # Initial download
  253. self.queue.put((sha1_hash, output_filename))
  254. self.queue.put((None, None))
  255. stdout_queue = queue.Queue()
  256. download_from_google_storage._downloader_worker_thread(0,
  257. self.queue,
  258. True,
  259. self.base_url,
  260. self.gsutil,
  261. stdout_queue,
  262. self.ret_codes,
  263. True,
  264. True,
  265. delete=False)
  266. expected_calls = [('check_call', ('cp', input_filename,
  267. output_filename))]
  268. if sys.platform != 'win32':
  269. expected_calls.append(
  270. ('check_call', ('stat', 'gs://sometesturl/%s' % sha1_hash)))
  271. expected_output = [
  272. '0> Downloading %s@%s...' % (output_filename, sha1_hash)
  273. ]
  274. expected_output.extend([
  275. '0> Extracting 3 entries from %s to %s' %
  276. (output_filename, output_dirname)
  277. ])
  278. expected_ret_codes = []
  279. self.assertEqual(list(stdout_queue.queue), expected_output)
  280. self.assertEqual(self.gsutil.history, expected_calls)
  281. self.assertEqual(list(self.ret_codes.queue), expected_ret_codes)
  282. self.assertTrue(os.path.exists(output_dirname))
  283. self.assertTrue(os.path.exists(extracted_filename))
  284. # Test noop download
  285. self.queue.put((sha1_hash, output_filename))
  286. self.queue.put((None, None))
  287. stdout_queue = queue.Queue()
  288. download_from_google_storage._downloader_worker_thread(0,
  289. self.queue,
  290. False,
  291. self.base_url,
  292. self.gsutil,
  293. stdout_queue,
  294. self.ret_codes,
  295. True,
  296. True,
  297. delete=False)
  298. self.assertEqual(list(stdout_queue.queue), [])
  299. self.assertEqual(self.gsutil.history, expected_calls)
  300. self.assertEqual(list(self.ret_codes.queue), [])
  301. self.assertTrue(os.path.exists(output_dirname))
  302. self.assertTrue(os.path.exists(extracted_filename))
  303. # With dirty flag file, previous extraction wasn't complete
  304. with open(os.path.join(self.base_path, 'subfolder.tmp'), 'a'):
  305. pass
  306. self.queue.put((sha1_hash, output_filename))
  307. self.queue.put((None, None))
  308. stdout_queue = queue.Queue()
  309. download_from_google_storage._downloader_worker_thread(0,
  310. self.queue,
  311. False,
  312. self.base_url,
  313. self.gsutil,
  314. stdout_queue,
  315. self.ret_codes,
  316. True,
  317. True,
  318. delete=False)
  319. expected_calls += [('check_call', ('cp', input_filename,
  320. output_filename))]
  321. if sys.platform != 'win32':
  322. expected_calls.append(
  323. ('check_call', ('stat', 'gs://sometesturl/%s' % sha1_hash)))
  324. expected_output = [
  325. '0> Detected tmp flag file for %s, re-downloading...' %
  326. (output_filename),
  327. '0> Downloading %s@%s...' % (output_filename, sha1_hash),
  328. '0> Removed %s...' % (output_dirname),
  329. '0> Extracting 3 entries from %s to %s' %
  330. (output_filename, output_dirname),
  331. ]
  332. expected_ret_codes = []
  333. self.assertEqual(list(stdout_queue.queue), expected_output)
  334. self.assertEqual(self.gsutil.history, expected_calls)
  335. self.assertEqual(list(self.ret_codes.queue), expected_ret_codes)
  336. self.assertTrue(os.path.exists(output_dirname))
  337. self.assertTrue(os.path.exists(extracted_filename))
  338. def test_download_worker_skips_not_found_file(self):
  339. sha1_hash = '7871c8e24da15bad8b0be2c36edc9dc77e37727f'
  340. input_filename = '%s/%s' % (self.base_url, sha1_hash)
  341. output_filename = os.path.join(self.base_path,
  342. 'uploaded_lorem_ipsum.txt')
  343. self.queue.put((sha1_hash, output_filename))
  344. self.queue.put((None, None))
  345. stdout_queue = queue.Queue()
  346. self.gsutil.add_expected(1, '', '') # Return error when 'cp' is called.
  347. download_from_google_storage._downloader_worker_thread(
  348. 0, self.queue, False, self.base_url, self.gsutil, stdout_queue,
  349. self.ret_codes, True, False)
  350. expected_output = [
  351. '0> Downloading %s@%s...' % (output_filename, sha1_hash),
  352. '0> Failed to fetch file %s for %s, skipping. [Err: ]' %
  353. (input_filename, output_filename),
  354. ]
  355. expected_calls = [('check_call', ('cp', input_filename,
  356. output_filename))]
  357. expected_ret_codes = [(1, 'Failed to fetch file %s for %s. [Err: ]' %
  358. (input_filename, output_filename))]
  359. self.assertEqual(list(stdout_queue.queue), expected_output)
  360. self.assertEqual(self.gsutil.history, expected_calls)
  361. self.assertEqual(list(self.ret_codes.queue), expected_ret_codes)
  362. def test_download_cp_fails(self):
  363. sha1_hash = '7871c8e24da15bad8b0be2c36edc9dc77e37727f'
  364. input_filename = '%s/%s' % (self.base_url, sha1_hash)
  365. output_filename = os.path.join(self.base_path,
  366. 'uploaded_lorem_ipsum.txt')
  367. self.gsutil.add_expected(101, '', 'Test error message.') # cp
  368. code = download_from_google_storage.download_from_google_storage(
  369. input_filename=sha1_hash,
  370. base_url=self.base_url,
  371. gsutil=self.gsutil,
  372. num_threads=1,
  373. directory=False,
  374. recursive=False,
  375. force=True,
  376. output=output_filename,
  377. ignore_errors=False,
  378. sha1_file=False,
  379. verbose=True,
  380. auto_platform=False,
  381. extract=False)
  382. expected_calls = [('check_call', ('cp', input_filename,
  383. output_filename))]
  384. self.assertEqual(self.gsutil.history, expected_calls)
  385. self.assertEqual(code, 101)
  386. def test_corrupt_download(self):
  387. q = queue.Queue()
  388. out_q = queue.Queue()
  389. ret_codes = queue.Queue()
  390. tmp_dir = tempfile.mkdtemp()
  391. sha1_hash = '7871c8e24da15bad8b0be2c36edc9dc77e37727f'
  392. output_filename = os.path.join(tmp_dir, 'lorem_ipsum.txt')
  393. q.put(('7871c8e24da15bad8b0be2c36edc9dc77e37727f', output_filename))
  394. q.put((None, None))
  395. def _write_bad_file():
  396. with open(output_filename, 'w') as f:
  397. f.write('foobar')
  398. self.gsutil.add_expected(0, '', '', _write_bad_file) # cp
  399. download_from_google_storage._downloader_worker_thread(
  400. 1, q, True, self.base_url, self.gsutil, out_q, ret_codes, True,
  401. False)
  402. self.assertTrue(q.empty())
  403. msg = ('1> ERROR remote sha1 (%s) does not match expected sha1 (%s).' %
  404. ('8843d7f92416211de9ebb963ff4ce28125932878', sha1_hash))
  405. self.assertEqual(
  406. out_q.get(),
  407. '1> Downloading %s@%s...' % (output_filename, sha1_hash))
  408. self.assertEqual(out_q.get(), msg)
  409. self.assertEqual(ret_codes.get(), (20, msg))
  410. self.assertTrue(out_q.empty())
  411. self.assertTrue(ret_codes.empty())
  412. def test_download_directory_no_recursive_non_force(self):
  413. sha1_hash = '7871c8e24da15bad8b0be2c36edc9dc77e37727f'
  414. input_filename = '%s/%s' % (self.base_url, sha1_hash)
  415. output_filename = os.path.join(self.base_path,
  416. 'uploaded_lorem_ipsum.txt')
  417. self.gsutil.add_expected(0, '', '') # version
  418. self.gsutil.add_expected(
  419. 0, '', '',
  420. lambda: shutil.copyfile(self.lorem_ipsum, output_filename)) # cp
  421. code = download_from_google_storage.download_from_google_storage(
  422. input_filename=self.base_path,
  423. base_url=self.base_url,
  424. gsutil=self.gsutil,
  425. num_threads=1,
  426. directory=True,
  427. recursive=False,
  428. force=False,
  429. output=None,
  430. ignore_errors=False,
  431. sha1_file=False,
  432. verbose=True,
  433. auto_platform=False,
  434. extract=False)
  435. expected_calls = [('check_call', ('version', )),
  436. ('check_call', ('cp', input_filename,
  437. output_filename))]
  438. if sys.platform != 'win32':
  439. expected_calls.append(
  440. ('check_call',
  441. ('stat',
  442. 'gs://sometesturl/7871c8e24da15bad8b0be2c36edc9dc77e37727f')))
  443. self.assertEqual(self.gsutil.history, expected_calls)
  444. self.assertEqual(code, 0)
  445. if __name__ == '__main__':
  446. unittest.main()