diff --git a/pyidr/study_parser.py b/pyidr/study_parser.py
index 6c5528af..604e778f 100755
--- a/pyidr/study_parser.py
+++ b/pyidr/study_parser.py
@@ -114,19 +114,32 @@ def __init__(self, study_file):
         self.study.update(self.parse_data_doi(self.study, "Study Data DOI"))
 
         self.components = []
-        for t in TYPES:
-            n = int(self.study.get('Study %ss Number' % t, 0))
-            for i in range(n):
-                self.log.debug("Parsing %s %g" % (t, i + 1))
-
-                d = self.parse(t, lines=self.get_lines(i + 1, t))
-                d.update({'Type': t})
-                d.update(self.study)
-                doi = self.parse_data_doi(d, "%s Data DOI" % t)
-                if doi:
-                    d.update(doi)
-                self.parse_annotation_file(d)
-                self.components.append(d)
+
+        # Find number of screens and experiments
+        n_screens = int(self.study.get('Study Screens Number', 0))
+        n_experiments = int(self.study.get('Study Experiments Number', 0))
+        self.log.debug("Expecting %s screen(s) and %s experiment(s)" %
+                       (n_screens, n_experiments))
+        if n_screens > 0 and n_experiments > 0:
+            component_regexp = '(Screen|Experiment)'
+        elif n_screens > 0:
+            component_regexp = '(Screen)'
+        elif n_experiments > 0:
+            component_regexp = '(Experiment)'
+        else:
+            raise Exception("Not enough screens and/or experiments")
+
+        # Find all study components in order
+        for i in range(n_screens + n_experiments):
+            lines, component_type = self.get_lines(i + 1, component_regexp)
+            d = self.parse(component_type, lines=lines)
+            d.update({'Type': component_type})
+            d.update(self.study)
+            doi = self.parse_data_doi(d, "%s Data DOI" % component_type)
+            if doi:
+                d.update(doi)
+            self.parse_annotation_file(d)
+            self.components.append(d)
 
         if not self.components:
             raise Exception("Need to define at least one screen or experiment")
@@ -161,23 +174,28 @@ def parse(self, scope, lines=None):
                 d[key] = value
         return d
 
-    def get_lines(self, index, component_type):
-        PATTERN = re.compile(r"^%s Number\t(\d+)" % component_type)
+    def get_lines(self, index, component_regexp):
+        self.log.debug("Parsing %s %g" % (component_regexp, index))
+        PATTERN = re.compile(r"^%s Number\t(\d+)" % component_regexp)
         found = False
         lines = []
+        component_type = None
        for idx, line in enumerate(self._study_lines):
             m = PATTERN.match(line)
             if m:
-                if int(m.group(1)) == index:
+                if int(m.group(2)) == index and found:
+                    raise Exception("Duplicate component %g" % index)
+                elif int(m.group(2)) == index and not found:
                     found = True
-                elif int(m.group(1)) != index and found:
-                    return lines
+                    component_type = m.group(1)
+                elif int(m.group(2)) != index and found:
+                    return lines, component_type
             if found:
                 self._study_lines_used[idx].append(("get_lines", index))
                 lines.append(line)
         if not lines:
-            raise Exception("Could not find %s %g" % (component_type, index))
-        return lines
+            raise Exception("Could not find %s %g" % (component_regexp, index))
+        return lines, component_type
 
     def parse_annotation_file(self, component):
         import glob
@@ -195,11 +213,11 @@ def parse_annotation_file(self, component):
 
         # Generate GitHub annotation URL
         if os.path.exists(os.path.join(self._dir, ".git")):
-            base_gh_url = "https://github.com/IDR/%s/blob/master/%s" % (
+            base_gh_url = "https://github.com/IDR/%s/blob/HEAD/%s" % (
                 m.group(1), m.group(3))
         else:
             base_gh_url = (
-                "https://github.com/IDR/idr-metadata/blob/master/%s" % name)
+                "https://github.com/IDR/idr-metadata/blob/HEAD/%s" % name)
 
         # Try to find single annotation file in root directory
         for extension in ['.csv', '.csv.gz']:
diff --git a/scripts/stats.py b/scripts/stats.py
index ba33a49f..f9d528a3 100755
--- a/scripts/stats.py
+++ b/scripts/stats.py
@@ -103,16 +103,15 @@ def studies(study_list, default_columns=["name", "path"]):
             continue
 
         logging.info("Finding containers for study %s" % study)
-        target = "Plate"
-        containers = glob(join(study, "screen[A-Z]"))
-        if containers:
-            assert not glob(join(study, "experiment*")), study
-        else:
-            target = "Dataset"
-            containers = glob(join(study, "experiment[A-Z]"))
+        containers = glob(
+            join(study, "screen[A-Z]")) + glob(join(study, "experiment[A-Z]"))
 
         assert len(containers) >= 1
         for container in sorted(containers):
+            if container.startswith(join(study, "screen")):
+                target = "Plate"
+            else:
+                target = "Dataset"
             bulks = glob(join(container, "*-bulk.yml"))
             bulks += glob(join(container, "**/*-bulk.yml"))
             for bulk in bulks: