Skip to content

ENH: Lightweight node cache checking #3026

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Sep 11, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 19 additions & 12 deletions nipype/pipeline/engine/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,27 +293,29 @@ def is_cached(self, rm_outdated=False):
"""
outdir = self.output_dir()

# Update hash
hashed_inputs, hashvalue = self._get_hashval()

# The output folder does not exist: not cached
if not op.exists(outdir):
logger.debug('[Node] Directory not found "%s".', outdir)
if not op.exists(outdir) or \
not op.exists(op.join(outdir, 'result_%s.pklz' % self.name)):
logger.debug('[Node] Not cached "%s".', outdir)
return False, False

hashfile = op.join(outdir, '_0x%s.json' % hashvalue)
cached = op.exists(hashfile)

# Check if updated
# Check if there are hashfiles
globhashes = glob(op.join(outdir, '_0x*.json'))
unfinished = [
path for path in globhashes
if path.endswith('_unfinished.json')
]
hashfiles = list(set(globhashes) - set(unfinished))

# Update hash
hashed_inputs, hashvalue = self._get_hashval()

hashfile = op.join(outdir, '_0x%s.json' % hashvalue)
logger.debug('[Node] Hashes: %s, %s, %s, %s',
hashed_inputs, hashvalue, hashfile, hashfiles)

cached = hashfile in hashfiles

# No previous hashfiles found, we're all set.
if cached and len(hashfiles) == 1:
assert(hashfile == hashfiles[0])
Expand Down Expand Up @@ -387,17 +389,17 @@ def hash_exists(self, updatehash=False):
return cached, self._hashvalue, hashfile, self._hashed_inputs

def run(self, updatehash=False):
"""Execute the node in its directory.
"""
Execute the node in its directory.

Parameters
----------

updatehash: boolean
When the hash stored in the output directory as a result of a previous run
does not match that calculated for this execution, updatehash=True only
updates the hash without re-running.
"""

"""
if self.config is None:
self.config = {}
self.config = merge_dict(deepcopy(config._sections), self.config)
Expand Down Expand Up @@ -441,6 +443,11 @@ def run(self, updatehash=False):
for outdatedhash in glob(op.join(self.output_dir(), '_0x*.json')):
os.remove(outdatedhash)

# _get_hashval needs to be called before running. When there is a valid (or seemingly
# valid) cache, the is_cached() method updates the hashval via _get_hashval.
# However, if this node's folder doesn't exist or the result file is not found, then
# the hashval needs to be generated here. See #3026 for the larger context.
self._get_hashval()
# Hashfile while running
hashfile_unfinished = op.join(
outdir, '_0x%s_unfinished.json' % self._hashvalue)
Expand Down