|
- #!/usr/bin/env python
- """A tool for extracting a list of symbols to export
- When exporting symbols from a dll or exe we either need to mark the symbols in
- the source code as __declspec(dllexport) or supply a list of symbols to the
- linker. This program automates the latter by inspecting the symbol tables of a
- list of link inputs and deciding which of those symbols need to be exported.
- We can't just export all the defined symbols, as there's a limit of 65535
- exported symbols and in clang we go way over that, particularly in a debug
- build. Therefore a large part of the work is pruning symbols either which can't
- be imported, or which we think are things that have definitions in public header
- files (i.e. template instantiations) and we would get defined in the thing
- importing these symbols anyway.
- """
- from __future__ import print_function
- import sys
- import re
- import os
- import subprocess
- import multiprocessing
- import argparse
- # Define functions which extract a list of symbols from a library using several
- # different tools. We use subprocess.Popen and yield a symbol at a time instead
- # of using subprocess.check_output and returning a list as, especially on
- # Windows, waiting for the entire output to be ready can take a significant
- # amount of time.
def dumpbin_get_symbols(lib):
    """Yield the defined external symbols reported by ``dumpbin /symbols``.

    Runs ``dumpbin /symbols <lib>`` and streams its output line by line,
    yielding the symbol name from every line that mentions a section
    (``SECT``) and is marked ``External``.

    The output pipe is closed and the child process is waited for when the
    generator finishes, including when the consumer stops iterating early.
    """
    process = subprocess.Popen(['dumpbin', '/symbols', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # The tool must never block waiting for input.
    process.stdin.close()
    try:
        for line in process.stdout:
            # Look for external symbols that are defined in some section
            match = re.match(r"^.+SECT.+External\s+\|\s+(\S+).*$", line)
            if match:
                yield match.group(1)
    finally:
        # Runs on normal exhaustion and on generator close alike.
        process.stdout.close()
        process.wait()
def nm_get_symbols(lib):
    """Yield the defined external symbols reported by ``nm``.

    Runs ``nm <lib>`` and streams its output line by line, yielding the
    symbol name from every line of the form ``<value> <type> <name>`` whose
    type letter is one of ``B D G R S T V W``.

    The output pipe is closed and the child process is waited for when the
    generator finishes, including when the consumer stops iterating early.
    """
    process = subprocess.Popen(['nm', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # The tool must never block waiting for input.
    process.stdin.close()
    try:
        for line in process.stdout:
            # Look for external symbols that are defined in some section
            match = re.match(r"^\S+\s+[BDGRSTVW]\s+(\S+)$", line)
            if match:
                yield match.group(1)
    finally:
        # Runs on normal exhaustion and on generator close alike.
        process.stdout.close()
        process.wait()
def readobj_get_symbols(lib):
    """Yield the defined external symbols reported by ``llvm-readobj``.

    Runs ``llvm-readobj --symbols <lib>`` and streams its output line by
    line. A symbol is yielded when its ``StorageClass`` line says
    ``External`` and the most recently seen ``Section`` is neither
    ``IMAGE_SYM_ABSOLUTE`` nor ``IMAGE_SYM_UNDEFINED``.

    The output pipe is closed and the child process is waited for when the
    generator finishes, including when the consumer stops iterating early.
    """
    process = subprocess.Popen(['llvm-readobj', '--symbols', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # The tool must never block waiting for input.
    process.stdin.close()
    # Nothing has been seen yet; a StorageClass line that arrives before any
    # Name/Section line is ignored instead of hitting an unbound local.
    name = None
    section = None
    try:
        for line in process.stdout:
            # When looking through the output of llvm-readobj we expect to
            # see Name, Section, then StorageClass, so record Name and
            # Section when we see them and decide if this is a defined
            # external symbol when we see StorageClass.
            match = re.search(r'Name: (\S+)', line)
            if match:
                name = match.group(1)
            match = re.search(r'Section: (\S+)', line)
            if match:
                section = match.group(1)
            match = re.search(r'StorageClass: (\S+)', line)
            if match:
                storageclass = match.group(1)
                if name is None or section is None:
                    continue
                if section != 'IMAGE_SYM_ABSOLUTE' and \
                   section != 'IMAGE_SYM_UNDEFINED' and \
                   storageclass == 'External':
                    yield name
    finally:
        # Runs on normal exhaustion and on generator close alike.
        process.stdout.close()
        process.wait()
- # Define functions which determine if the target is 32-bit Windows (as that's
- # where calling convention name decoration happens).
def dumpbin_is_32bit_windows(lib):
    """Return True if ``dumpbin /headers`` reports an x86 machine for *lib*.

    Only the first line containing ``machine (<name>)`` is examined; the
    result is True when ``<name>`` is ``x86`` and False otherwise, including
    when no such line is found.
    """
    # dumpbin /headers can output a huge amount of data (>100MB in a debug
    # build) so we read only up to the 'machine' line then close the output.
    process = subprocess.Popen(['dumpbin', '/headers', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    retval = False
    try:
        for line in process.stdout:
            match = re.match(r'.+machine \((\S+)\)', line)
            if match:
                retval = (match.group(1) == 'x86')
                break
    finally:
        # Close the pipe before waiting so the child is not left blocked
        # writing output nobody reads.
        process.stdout.close()
        process.wait()
    return retval
def objdump_is_32bit_windows(lib):
    """Return True if ``objdump -f`` reports the ``pe-i386`` format for *lib*.

    The first output line containing ``file format <name>`` decides the
    result; False is returned when no such line is found.
    """
    output = subprocess.check_output(['objdump', '-f', lib],
                                     universal_newlines=True)
    # check_output returns one string; iterating it directly would walk it
    # character by character and the pattern could never match.
    for line in output.splitlines():
        match = re.match(r'.+file format (\S+)', line)
        if match:
            return (match.group(1) == 'pe-i386')
    return False
def readobj_is_32bit_windows(lib):
    """Return True if ``llvm-readobj`` reports the ``COFF-i386`` format.

    Runs ``llvm-readobj --file-headers <lib>``; the first output line that
    starts with ``Format: <name>`` decides the result. False is returned
    when no such line is found.
    """
    output = subprocess.check_output(['llvm-readobj', '--file-headers', lib],
                                     universal_newlines=True)
    # check_output returns one string; iterating it directly would walk it
    # character by character and the pattern could never match.
    for line in output.splitlines():
        match = re.match(r'Format: (\S+)', line)
        if match:
            return (match.group(1) == 'COFF-i386')
    return False
# MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
# identifier/type mangling we can decide which symbols could possibly be
# required and which we can discard.
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Decide whether a Microsoft-mangled symbol should be exported.

    Returns the name to export (which has its calling convention decoration
    removed for unmangled names when *calling_convention_decoration* is
    true), or None if the symbol should be discarded.
    """
    # Keep unmangled (i.e. extern "C") names
    if '?' not in symbol:
        if calling_convention_decoration:
            # Remove calling convention decoration from names
            match = re.match(r'[_@]([^@]+)', symbol)
            if match:
                return match.group(1)
        return symbol
    # Function template instantiations start with ?$; keep the instantiations
    # of clang::Type::getAs, as some of them are explicit specializations that
    # are defined in clang's lib/AST/Type.cpp; discard the rest as it's
    # assumed that the definition is public
    if re.match(r'\?\?\$getAs@.+@Type@clang@@', symbol):
        return symbol
    if symbol.startswith('??$'):
        return None
    # Deleting destructors start with ?_G or ?_E and can be discarded because
    # link.exe gives you a warning telling you they can't be exported if you
    # don't
    if symbol.startswith(('??_G', '??_E')):
        return None
    # Constructors (?0) and destructors (?1) of templates (?$) are assumed to
    # be defined in headers and not required to be kept
    if symbol.startswith(('??0?$', '??1?$')):
        return None
    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # that mentions an anonymous namespace can be discarded, as the anonymous
    # namespace doesn't exist outside of that translation unit.
    if re.search(r'\?A(0x\w+)?@', symbol):
        return None
    # Keep mangled llvm:: and clang:: function symbols. How we detect these is
    # a bit of a mess and imprecise, but that avoids having to completely
    # demangle the symbol name. The outermost namespace is at the end of the
    # identifier mangling, and the identifier mangling is followed by the type
    # mangling, so we look for (llvm|clang)@@ followed by something that looks
    # like a function type mangling. To spot a function type we use (this is
    # derived from clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X   (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    if re.search(r'(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$', symbol):
        return symbol
    return None
# Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
# demangle the identifier mangling to identify symbols that can be safely
# discarded.
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Decide whether an Itanium-mangled symbol should be exported.

    Returns the name to export (with one leading underscore removed when
    *calling_convention_decoration* is true), or None if the symbol should
    be discarded.
    """
    # Start by removing any calling convention decoration (which we expect to
    # see on all symbols, even mangled C++ symbols)
    if calling_convention_decoration and symbol.startswith('_'):
        symbol = symbol[1:]
    # Keep unmangled names
    if not symbol.startswith(('_', '.')):
        return symbol
    # Discard manglings that aren't nested names
    match = re.match('_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    # Demangle the name. If the name is too complex then we don't need to keep
    # it, but if the demangling fails then keep the symbol just in case.
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        return None
    if not names:
        return symbol
    # Constructors and destructors of template classes are assumed to be
    # defined in headers and not required to be kept. The length check keeps
    # a one-component name from indexing past the start of the list.
    if (re.match('[CD][123]', names[-1][0]) and len(names) >= 2
            and names[-2][1]):
        return None
    # Keep the instantiations of clang::Type::getAs, as some of them are
    # explicit specializations that are defined in clang's lib/AST/Type.cpp;
    # discard any other function template instantiations as it's assumed that
    # the definition is public
    if symbol.startswith('_ZNK5clang4Type5getAs'):
        return symbol
    if names[-1][1]:
        return None
    # Keep llvm:: and clang:: names
    if names[0][0] == '4llvm' or names[0][0] == '5clang':
        return symbol
    # Discard everything else
    return None
class TooComplexName(Exception):
    """Signals a mangling too complex to be part of a public interface.

    Certain kinds of complex manglings are assumed never to belong to a
    public interface; the demangling helpers report them by raising this
    exception.
    """
def parse_itanium_name(arg):
    """Parse one Itanium mangled name from the start of *arg*.

    Returns a ``(name, rest of string)`` pair. For a length-prefixed name
    the returned name keeps its length prefix (e.g. ``'4llvm'``). When
    nothing can be parsed the pair is ``(None, arg)``.
    """
    # Check for a normal name
    match = re.match(r'(\d+)(.+)', arg)
    if match:
        n = int(match.group(1))
        name = match.group(1) + match.group(2)[:n]
        rest = match.group(2)[n:]
        return name, rest
    # Check for constructor/destructor names
    match = re.match('([CD][123])(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Assume that a sequence of characters that doesn't end a nesting is an
    # operator (this is very imprecise, but appears to be good enough)
    match = re.match('([^E]+)(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Anything else: we can't handle it
    return None, arg
def skip_itanium_template(arg):
    """Skip an Itanium mangled template argument list at the start of *arg*.

    The argument list is thrown away and the rest of the string is returned.
    None is returned if the string runs out before the closing ``E``.

    Raises TooComplexName when an expression (``L`` or ``X``) is found in
    the argument list.
    """
    # A template argument list starts with I
    assert arg.startswith('I'), arg
    tmp = arg[1:]
    while tmp:
        # Check for names
        match = re.match(r'(\d+)(.+)', tmp)
        if match:
            n = int(match.group(1))
            tmp = match.group(2)[n:]
            continue
        # Check for substitutions
        match = re.match('S[A-Z0-9]*_(.+)', tmp)
        if match:
            tmp = match.group(1)
        # Start of a template
        elif tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
        # Start of a nested name
        elif tmp.startswith('N'):
            _, tmp = parse_itanium_nested_name(tmp)
        # Start of an expression: assume that it's too complicated
        elif tmp.startswith(('L', 'X')):
            raise TooComplexName
        # End of the template
        elif tmp.startswith('E'):
            return tmp[1:]
        # Something else: probably a type, skip it
        else:
            tmp = tmp[1:]
    return None
def parse_itanium_nested_name(arg):
    """Parse an Itanium mangled nested name from the start of *arg*.

    Returns ``(components, rest of string)`` where ``components`` is a list
    of ``(name, is_template)`` pairs. ``(None, None)`` is returned when the
    name cannot be demangled or the closing ``E`` is never reached.
    """
    # A nested name starts with N
    assert arg.startswith('N'), arg
    # Drop the N, together with a leading substitution if there is one
    substitution = re.match('NS[A-Z0-9]*_(.+)', arg)
    remaining = substitution.group(1) if substitution else arg[1:]
    # Drop CV-qualifiers and ref qualifiers
    qualifiers = re.match('[rVKRO]*(.+)', remaining)
    if qualifiers:
        remaining = qualifiers.group(1)
    components = []
    # Keep consuming names until the nested name is closed
    while remaining:
        # An E ends the nested name
        if remaining.startswith('E'):
            return components, remaining[1:]
        component, remaining = parse_itanium_name(remaining)
        if not component:
            # We don't know how to demangle this
            return None, None
        # A following I means this component has template arguments, which
        # are skipped rather than recorded
        has_template_args = remaining.startswith('I')
        if has_template_args:
            remaining = skip_itanium_template(remaining)
        components.append((component, has_template_args))
    # Running out of input before the closing E means something went wrong
    return None, None
def extract_symbols(arg):
    """Count the kept symbols of one library.

    *arg* is a ``(get_symbols, should_keep_symbol,
    calling_convention_decoration, lib)`` tuple, packed that way so the
    function can be used with a pool's map. Every symbol produced by
    ``get_symbols(lib)`` is passed through ``should_keep_symbol``; falsy
    results are dropped. Returns a dict mapping each kept name to the
    number of times it was seen.
    """
    get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg
    counts = {}
    for raw_symbol in get_symbols(lib):
        kept = should_keep_symbol(raw_symbol, calling_convention_decoration)
        if not kept:
            continue
        counts[kept] = counts.get(kept, 0) + 1
    return counts
if __name__ == '__main__':
    tool_exes = ['dumpbin', 'nm', 'objdump', 'llvm-readobj']
    parser = argparse.ArgumentParser(
        description='Extract symbols to export from libraries')
    parser.add_argument('--mangling', choices=['itanium', 'microsoft'],
                        required=True, help='expected symbol mangling scheme')
    parser.add_argument('--tools', choices=tool_exes, nargs='*',
                        help='tools to use to extract symbols and determine the'
                        ' target')
    parser.add_argument('libs', metavar='lib', type=str, nargs='+',
                        help='libraries to extract symbols from')
    parser.add_argument('-o', metavar='file', type=str, help='output to file')
    args = parser.parse_args()

    # Determine the function to use to get the list of symbols from the inputs,
    # and the function to use to determine if the target is 32-bit windows.
    tools = {'dumpbin': (dumpbin_get_symbols, dumpbin_is_32bit_windows),
             'nm': (nm_get_symbols, None),
             'objdump': (None, objdump_is_32bit_windows),
             'llvm-readobj': (readobj_get_symbols, readobj_is_32bit_windows)}
    get_symbols = None
    is_32bit_windows = None
    # If we have a tools argument then use that for the list of tools to check
    if args.tools:
        tool_exes = args.tools
    # Find a tool to use by trying each in turn until we find one that exists
    # (subprocess.Popen will throw OSError when the program does not exist)
    for exe in tool_exes:
        try:
            # Close std streams as we don't want any output and we don't
            # want the process to wait for something on stdin.
            p = subprocess.Popen([exe], stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 stdin=subprocess.PIPE,
                                 universal_newlines=True)
            p.stdout.close()
            p.stderr.close()
            p.stdin.close()
            p.wait()
        except OSError:
            continue
        # Keep going until we have a tool to use for both get_symbols and
        # is_32bit_windows
        if not get_symbols:
            get_symbols = tools[exe][0]
        if not is_32bit_windows:
            is_32bit_windows = tools[exe][1]
        if get_symbols and is_32bit_windows:
            break
    # sys.exit is used throughout: the bare exit() helper is only installed
    # by the site module and is not guaranteed to exist.
    if not get_symbols:
        print("Couldn't find a program to read symbols with", file=sys.stderr)
        sys.exit(1)
    if not is_32bit_windows:
        print("Couldn't find a program to determining the target", file=sys.stderr)
        sys.exit(1)

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == 'microsoft':
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    suffixes = ('.lib', '.a', '.obj', '.o')
    libs = []
    for lib in args.libs:
        # When invoked by cmake the arguments are the cmake target names of the
        # libraries, so we need to add .lib/.a to the end and maybe lib to the
        # start to get the filename. Also allow objects.
        if not lib.endswith(suffixes):
            for s in suffixes:
                if os.path.exists(lib + s):
                    lib = lib + s
                    break
                if os.path.exists('lib' + lib + s):
                    lib = 'lib' + lib + s
                    break
        if not lib.endswith(suffixes):
            print("Don't know what to do with argument "+lib, file=sys.stderr)
            sys.exit(1)
        libs.append(lib)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = is_32bit_windows(libs[0])

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    pool = multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [(get_symbols, should_keep_symbol,
                 calling_convention_decoration, x) for x in libs]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(3600)
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)
    # All work is done; reap the worker processes.
    pool.join()

    # Merge everything into a single dict
    symbols = {}
    for this_lib_symbols in libs_symbols:
        for k, v in this_lib_symbols.items():
            symbols[k] = symbols.get(k, 0) + v

    # Count instances of member functions of template classes, and map the
    # symbol name to the function+class. We do this under the assumption that if
    # a member function of a template class is instantiated many times it's
    # probably declared in a public header file.
    # The "" entry is the shared bucket for symbols that are not member
    # functions of templates; its count stays at 0.
    template_function_count = {"": 0}
    template_function_mapping = {}
    for k in symbols:
        name = None
        if args.mangling == 'microsoft':
            # Member functions of templates start with
            # ?<fn_name>@?$<class_name>@, so we map to <fn_name>@?$<class_name>.
            # As manglings go from the innermost scope to the outermost scope
            # this means:
            #  * When we have a function member of a subclass of a template
            #    class then <fn_name> will actually contain the mangling of
            #    both the subclass and the function member. This is fine.
            #  * When we have a function member of a template subclass of a
            #    (possibly template) class then it's the innermost template
            #    subclass that becomes <class_name>. This should be OK so long
            #    as we don't have multiple classes with a template subclass of
            #    the same name.
            match = re.search(r"^\?(\??\w+\@\?\$\w+)\@", k)
            if match:
                name = match.group(1)
        else:
            # Find member functions of templates by demangling the name and
            # checking if the second-to-last name in the list is a template.
            match = re.match('_Z(T[VTIS])?(N.+)', k)
            if match:
                try:
                    names, _ = parse_itanium_nested_name(match.group(2))
                except TooComplexName:
                    # Manglings that are too complex should already have been
                    # filtered out, but if we happen to somehow see one here
                    # just leave it as-is.
                    names = None
                # The length check keeps a one-component name from indexing
                # past the start of the list.
                if names and len(names) >= 2 and names[-2][1]:
                    name = ''.join([x for x, _ in names])
        if name:
            template_function_count[name] = \
                template_function_count.get(name, 0) + 1
            template_function_mapping[k] = name
        else:
            template_function_mapping[k] = ""

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Aren't instances of member functions of templates which have been
    #    instantiated 100 times or more, which are assumed to have public
    #    definitions. (100 is an arbitrary guess here.)
    outfile = open(args.o, 'w') if args.o else sys.stdout
    try:
        for k, v in symbols.items():
            template_count = template_function_count[template_function_mapping[k]]
            if v == 1 and template_count < 100:
                print(k, file=outfile)
    finally:
        # Close (and so flush) a file we opened ourselves; leave stdout alone.
        if outfile is not sys.stdout:
            outfile.close()
|