[Scons-users] Reducing toposort time for Java builds

Greg Ward greg at gerg.ca
Tue Jul 17 13:48:31 EDT 2012


On 13 July 2012, Greg Ward said:
> On 12 July 2012, Dirk Bächle said:
> >
> > I started to write down the results of my own investigations for the
> > signature subsystem in SCons. Please find it attached and see if it
> > helps you in any way. It's neither complete, nor pretty ;).
>
> Thank you! That was extremely helpful. I've almost got it working.
> Will post my code next week, when it is working.

Got it working. Thanks again for your help. I'll attach my code for
posterity. Moral of the story: it is a *lot* of work to write a custom
Node subclass. It's undocumented. Using the Memoizer module is
implicitly required, and forces ugly cut 'n paste programming. Yuck.
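
To show what I mean by cut 'n paste: every memoized method needs a
class-level Memoize.CountValue registration plus the same hand-rolled
check against self._memo. Excerpted from get_namesig() in the attached
code:

    memoizer_counters.append(Memoize.CountValue('get_namesig'))

    def get_namesig(self):
        try:
            return self._memo['get_namesig']   # already computed this run
        except KeyError:
            pass
        md5 = hashlib.md5()
        for fn in self.filenames:
            md5.update(fn)
        namesig = md5.hexdigest()
        self._memo['get_namesig'] = namesig    # cache for the next call
        return namesig

Repeat that dance for get_stored_info() and every other expensive method
and you can see why this wants a decorator.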

On the upside, a do-nothing build went from ~19-20 sec to ~4-5 sec,
just from reducing the number of nodes in my graph by ~15,000. (We have
~15,000 Java source files, but compile them in ~70 chunks. So really I
removed ~15,000 nodes and added ~70.)
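
In case anyone wants to try this, here is a minimal sketch of how a
FileSet might be wired up in an SConscript. The fileset module name, the
JavaChunk builder, and the paths are illustrative only, not taken from my
actual build; the one real constraint (enforced by the class) is that you
pass plain filename strings, not Nodes, so you can't feed it Glob()
results directly:

    import os
    from fileset import FileSet     # the module attached below

    env = Environment()

    # gather one chunk's worth of Java sources as plain strings
    java_files = []
    for root, dirs, files in os.walk(Dir('#src/chunk1').abspath):
        java_files.extend(os.path.join(root, f)
                          for f in files if f.endswith('.java'))

    # one node standing in for the whole chunk of source files
    chunk1 = FileSet(env.fs, 'chunk1-java', java_files)

    # hypothetical builder that compiles the whole chunk in one javac run
    env.JavaChunk('#build/chunk1', chunk1)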

Greg
-------------- next part --------------
'''Provides FileSet, a Node class that lumps a bunch of files together.
Handy for treating collections of Java source as a unit, which makes SCons
run much faster (smaller dependency graph).'''

import os
import errno
import hashlib

from SCons import Errors, Warnings, Node, SConsign, Memoize

class FileSetNodeInfo(Node.NodeInfoBase):
    current_version_id = 1
    field_list = [
        'namesig',   # MD5 hash of the list of filenames
        'csig',      # MD5 hash of the concatenated file contents
    ]

    def str_to_node(self, s):
        return FileSet._all[s]

class FileSetBuildInfo(Node.BuildInfoBase):
    current_version_id = 1

    # bogus: everything depends on BuildInfo subclasses providing these two
    # methods, but BuildInfo neither specifies that nor provides default
    # implementations

    def convert_to_sconsign(self):
        pass

    def convert_from_sconsign(self, dir, name):
        pass

class FileSet(Node.Node):
    '''A node representing a collection of files, which are treated as one
    for purposes of dependency tracking. This is *not* a composite of File
    nodes: that would defeat the whole purpose of this class, which is to
    reduce the number of nodes in SCons' graph, thereby improving toposort
    performance.'''

    _all = {}                   # map name to FileSet

    memoizer_counters = []

    NodeInfo = FileSetNodeInfo
    BuildInfo = FileSetBuildInfo

    def __init__(self, fs, name, filenames):
        super(FileSet, self).__init__()
        self.fs = fs
        self.name = name

        # chop the top dir off each filename, so we go from absolute
        # filenames to top-relative (makes compiler errors nicer)
        top = fs.Top.abspath + os.sep
        choplen = len(top)
        self.filenames = []
        append = self.filenames.append
        for fn in filenames:
            # If a caller passes in a list of Node objects, that completely
            # misses the point of this whole class. This usually happens
            # with generated files, and putting generated files into a
            # FileSet also screws up content signatures: in a fresh working
            # dir, the generated files don't exist yet, so they can't be
            # included in the content of the FileSet. Solution: indirectly
            # ban generated files from FileSet by requiring filenames only,
            # no Node objects.
            if not isinstance(fn, basestring):
                raise Errors.UserError(
                    'pass only filenames (strings) to FileSet, not Nodes')
            append(fn[choplen:] if fn[0:choplen] == top else fn)

        if not filenames:
            Warnings.warn(
                Warnings.WarningOnByDefault,
                '%s: empty filename list' % self.name)

        self._all[name] = self

    def __str__(self):
        return self.name

    def __repr__(self):
        return '<%s at 0x%0x: %s>' % (self.__class__.__name__, id(self), self)

    def log(self, msg, *args):
        print('%s(%s): ' % (self.__class__.__name__, self.name) +
              msg % args)

    def changed_since_last_build(self, target, prev_ni):
        # return true if *any* of the files in the fileset were modified
        # since the last build of target

        try:
            prev_namesig = prev_ni.namesig
            prev_csig = prev_ni.csig
        except AttributeError:
            # prev_ni is incomplete or None: this will force us to decide
            # the fileset has changed, below
            prev_namesig = None
            prev_csig = None

        #self.log('changed_since_last_build: prev_namesig=%s, prev_csig=%s',
        #         prev_namesig, prev_csig)

        # Ignore file timestamps (and sizes) because they're useless.
        # Consider:
        #   case 1:
        #     all files have the same mtime. It's possible that someone
        #     modified content quickly (within the same second as the
        #     previous build), so we need to check content to be sure we
        #     don't miss necessary rebuilds.
        #   case 2:
        #     some files have a different mtime. It's possible that they
        #     were written without being modified, e.g. someone edited,
        #     undid the edit, and saved. Or edited and used their VC
        #     system to undo the edit. We need to check the content to
        #     avoid unnecessary rebuilds.
        # Either way, we need to check the content... so why bother
        # looking at mtime or size?

        # if the list of files has changed, we don't need to look any
        # farther: the node has changed
        cur_ni = self.get_ninfo()
        if cur_ni.namesig is None:
            cur_ni.namesig = self.get_namesig()

        if cur_ni.namesig != prev_namesig:
            #self.log('namesig changed (new value: %s)', cur_ni.namesig)
            return True
        if self.get_csig() != prev_csig:
            #self.log('csig changed (new value: %s)', self.get_csig())
            return True

        #self.log('no evidence of change')
        return False

    memoizer_counters.append(Memoize.CountValue('get_stored_info'))

    def get_stored_info(self):
        try:
            return self._memo['get_stored_info']
        except KeyError:
            pass

        #self.log('get_stored_info: self.ninfo = %r', vars(self.ninfo))

        sconsign = self.fs.Top.sconsign()
        try:
            entry = sconsign.get_entry(self.name)
        except KeyError:
            entry = SConsign.SConsignEntry()
            entry.binfo = self.new_binfo()  # huh? why not reuse what we have?
            entry.ninfo = self.new_ninfo()

        self._memo['get_stored_info'] = entry
        return entry

    def store_info(self):
        sconsign = self.fs.Top.sconsign()
        sconsign.store_info(self.name, self)

    def visited(self):
        #self.log('visited()')
        super(FileSet, self).visited()

        ninfo = self.get_ninfo()
        ninfo.namesig = self.get_namesig()

        self.store_info()

    # argh! holy cut 'n paste programming Batman! this needs a decorator...

    memoizer_counters.append(Memoize.CountValue('get_namesig'))

    def get_namesig(self):
        try:
            return self._memo['get_namesig']
        except KeyError:
            pass

        #self.log('iterating over filenames to calculate namesig')
        md5 = hashlib.md5()
        for fn in self.filenames:
            md5.update(fn)

        namesig = md5.hexdigest()
        self._memo['get_namesig'] = namesig
        return namesig

    # WTF? why does this cache in the FileSetNodeInfo object, when
    # everything else uses Memoize? (I'm just aping SCons/Node/FS.py)
    def get_csig(self):
        ninfo = self.get_ninfo()
        try:
            return ninfo.csig
        except AttributeError:
            pass

        md5 = hashlib.md5()
        for fn in self.filenames:
            # Assume these are source files, therefore small enough to read
            # into memory whole. Also assume they are not generated files,
            # i.e. they already exist when processing SConstruct.
            with open(fn) as f:
                md5.update(f.read())

        csig = md5.hexdigest()
        ninfo.csig = csig
        return csig

