Inputs API

This module handles the discovery and validation of input sequencing files.

seqnado.inputs.Metadata

Bases: BaseModel

Metadata for samples. Optional fields can be set to None.

set_mcc_defaults

set_mcc_defaults() -> Self

Set default consensus_group for MCC assay.

Source code in seqnado/inputs/core.py, lines 103-108
@model_validator(mode='after')
def set_mcc_defaults(self) -> Self:
    """Set default consensus_group for MCC assay."""
    if self.assay == Assay.MCC and self.consensus_group is None:
        self.consensus_group = 'default'
    return self
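
A minimal sketch of what this validator does in practice: constructing Metadata for the MCC assay without a consensus_group picks up the default. The Assay import path and the ability to pass assay directly to the constructor are assumptions here.

from seqnado.inputs import Metadata
from seqnado.inputs import Assay   # import path assumed

md = Metadata(assay=Assay.MCC)     # consensus_group not given
print(md.consensus_group)          # 'default', filled in by set_mcc_defaults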

seqnado.inputs.FastqCollection

Bases: BaseFastqCollection

Represents a collection of sequencing samples (FASTQ files) grouped into named sets, with optional per-sample metadata.

Attributes:

  fastq_sets (list[FastqSet]): List of FastqSet objects (paired or single-end samples).
  metadata (list[Metadata]): List of Metadata objects corresponding one-to-one with fastq_sets.

sample_ids property

sample_ids: list[str]

Returns all sample IDs in the design.

sample_names property

sample_names: list[str]

Returns all sample names in the design.

fastq_paths property

fastq_paths: list[Path]

Flattens all R1/R2 file paths into a single list.

fastq_pairs property

fastq_pairs: dict[str, list[Path]]

Returns a dictionary mapping sample names to their FASTQ file paths.
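
A brief, hedged sketch of how these properties read once a collection exists. The constructor call, the Assay member, and the file-naming convention (paired files grouped by a shared stem) are placeholders; see from_fastq_files and from_directory below for the real entry points.

from seqnado.inputs import FastqCollection
from seqnado.inputs import Assay   # import path assumed

fc = FastqCollection.from_fastq_files(
    assay=Assay.RNA,               # assumed Assay member
    files=["sampleA_R1.fastq.gz", "sampleA_R2.fastq.gz"],
)
print(fc.sample_names)             # e.g. ['sampleA']
print(fc.fastq_paths)              # flat list of R1/R2 Path objects
print(fc.fastq_pairs)              # e.g. {'sampleA': [Path('sampleA_R1.fastq.gz'), Path('sampleA_R2.fastq.gz')]}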

validate_non_ip_assay classmethod

validate_non_ip_assay(v: Assay) -> Assay

Ensure the assay doesn't require IP (immunoprecipitation).

Source code in seqnado/inputs/fastq.py, lines 361-369
@field_validator("assay")
@classmethod
def validate_non_ip_assay(cls, v: Assay) -> Assay:
    """Ensure the assay doesn't require IP (immunoprecipitation)."""
    if v in Assay.ip_assays():
        raise ValueError(
            f"Assay '{v.value}' requires IP and should use IPSampleCollection instead"
        )
    return v
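
In practice this means constructing a FastqCollection with an IP-style assay fails at validation time. A sketch, assuming the Assay import path and that the member used here is one of Assay.ip_assays(); note that pydantic wraps the ValueError in a ValidationError.

import pydantic
from seqnado.inputs import FastqCollection
from seqnado.inputs import Assay   # import path assumed

try:
    FastqCollection(assay=Assay.CHIP, fastq_sets=[], metadata=[])   # assumed IP assay member
except pydantic.ValidationError as err:
    print(err)   # mentions that an IPSampleCollection should be used instead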

query

query(sample_name: str) -> FastqSet

Retrieve the FastqSet by its sample name.

Raises:

  ValueError: Raised if sample_name is not found.

Source code in seqnado/inputs/fastq.py, lines 418-428
def query(self, sample_name: str) -> FastqSet:
    """
    Retrieve the FastqSet by its sample name.

    Raises:
        ValueError: If sample_name not found.
    """
    try:
        return next(fs for fs in self.fastq_sets if fs.sample_id == sample_name)
    except StopIteration:
        raise ValueError(f"Sample '{sample_name}' not found in SampleCollection")

is_paired_end

is_paired_end(uid: str) -> bool

Check if the given sample ID is paired-end.

Source code in seqnado/inputs/fastq.py, lines 430-434
def is_paired_end(self, uid: str) -> bool:
    """
    Check if the given sample ID is paired-end.
    """
    return self.to_dataframe().loc[uid, "r2"] is not None

from_fastq_files classmethod

from_fastq_files(
    assay: Assay,
    files: Iterable[str | Path],
    metadata: (
        Callable[[str], Metadata] | Metadata | None
    ) = None,
    **fastqset_kwargs: Any
) -> FastqCollection

Build a FastqCollection by scanning a list of FASTQ paths:

  1. Convert raw paths to FastqFile.
  2. Group by sample_base and sort by read_number.
  3. Create FastqSet (single- or paired-end) for each sample.
  4. Generate Metadata via metadata(sample_name), or default.

Parameters:

  files (Iterable[str | Path], required): Iterable of file paths (strings or Path).
  metadata (Callable[[str], Metadata] | Metadata | None, default None):
    • Callable(sample_name) → Metadata to customize per-sample metadata.
    • Single Metadata instance applied to all.
    • None → defaults to Metadata().
  fastqset_kwargs (Any, default {}): Extra fields forwarded to FastqSet constructor.
Source code in seqnado/inputs/fastq.py, lines 436-486
@classmethod
def from_fastq_files(
    cls,
    assay: Assay,
    files: Iterable[str | Path],
    metadata: Callable[[str], Metadata] | Metadata | None = None,
    **fastqset_kwargs: Any,
) -> FastqCollection:
    """
    Build a SampleCollection by scanning a list of FASTQ paths:

    1. Convert raw paths to FastqFile.
    2. Group by `sample_base` and sort by read_number.
    3. Create FastqSet (single- or paired-end) for each sample.
    4. Generate Metadata via `metadata(sample_name)`, or default.

    Args:
        files: Iterable of file paths (strings or Path).
        metadata:
            - Callable(sample_name) → Metadata to customize per-sample metadata.
            - Single Metadata instance applied to all.
            - None → defaults to Metadata().
        fastqset_kwargs: Extra fields forwarded to FastqSet constructor.
    """
    # Convert and sort
    fq_files = [FastqFile(path=Path(f)) for f in files]
    fq_files.sort(key=lambda x: (x.sample_base, x.read_number))

    # Group by sample_stem
    groups: dict[str, list[FastqFile]] = defaultdict(list)
    for fq in fq_files:
        groups[fq.sample_base].append(fq)

    _fastq_sets: list[FastqSet] = []
    _metadata: list[Metadata] = []
    for sample, fqs in groups.items():
        # Build FastqSet
        if len(fqs) == 1:
            fs = FastqSet(sample_id=sample, r1=fqs[0], **fastqset_kwargs)
        elif len(fqs) == 2:
            fs = FastqSet(sample_id=sample, r1=fqs[0], r2=fqs[1], **fastqset_kwargs)
        else:
            raise ValueError(
                f"Unexpected number of FASTQ files for '{sample}': {len(fqs)}"
            )
        _fastq_sets.append(fs)

        # Build Metadata using base class method
        _metadata.append(cls._build_metadata(sample, metadata, assay))

    return cls(assay=assay, fastq_sets=_fastq_sets, metadata=_metadata)
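
A hedged usage sketch: a callable can derive per-sample Metadata from the sample name. The naming convention, the Metadata field used in the callable, and the Assay member are illustrative assumptions.

from seqnado.inputs import FastqCollection, Metadata
from seqnado.inputs import Assay   # import path assumed

def make_metadata(sample_name: str) -> Metadata:
    # Hypothetical convention: '<group>_<replicate>' encoded in the sample name
    return Metadata(consensus_group=sample_name.split("_")[0])

fc = FastqCollection.from_fastq_files(
    assay=Assay.MCC,               # assumed Assay member
    files=[
        "ko_1_R1.fastq.gz", "ko_1_R2.fastq.gz",
        "wt_1_R1.fastq.gz", "wt_1_R2.fastq.gz",
    ],
    metadata=make_metadata,
)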

from_directory classmethod

from_directory(
    assay: Assay,
    directory: str | Path,
    glob_patterns: Iterable[str] = (
        "*.fq",
        "*.fq.gz",
        "*.fastq",
        "*.fastq.gz",
    ),
    metadata: (
        Callable[[str], Metadata] | Metadata | None
    ) = None,
    **kwargs: Any
) -> FastqCollection

Recursively scan a directory for FASTQ files and build a FastqCollection.

Parameters:

  directory (str | Path, required): Root path to search.
  glob_patterns (Iterable[str], default ('*.fq', '*.fq.gz', '*.fastq', '*.fastq.gz')): Filename patterns to include.
  metadata (Callable[[str], Metadata] | Metadata | None, default None): Callable(sample_name) → Metadata or single Metadata instance.
  **kwargs (Any, default {}): Extra fields converted directly to a shared Metadata.
Source code in seqnado/inputs/fastq.py, lines 488-508
@classmethod
def from_directory(
    cls,
    assay: Assay,
    directory: str | Path,
    glob_patterns: Iterable[str] = ("*.fq", "*.fq.gz", "*.fastq", "*.fastq.gz"),
    metadata: Callable[[str], Metadata] | Metadata | None = None,
    **kwargs: Any,
) -> FastqCollection:
    """
    Recursively scan a directory for FASTQ files and build a SampleCollection.

    Args:
        directory: Root path to search.
        glob_patterns: Filename patterns to include.
        metadata: Callable(sample_name) → Metadata or single Metadata instance.
        **kwargs: Extra fields converted directly to a shared Metadata.
    """
    files = cls._discover_files(directory, glob_patterns)
    metadata = cls._prepare_metadata_for_directory(metadata, **kwargs)
    return cls.from_fastq_files(assay=assay, files=files, metadata=metadata)
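
A hedged example of the typical entry point: scan a directory recursively and let the defaults apply. The directory layout and the Assay member are placeholders.

from seqnado.inputs import FastqCollection
from seqnado.inputs import Assay   # import path assumed

fc = FastqCollection.from_directory(
    assay=Assay.RNA,               # assumed Assay member
    directory="fastq/",            # searched with the default glob patterns
)
print(len(fc.fastq_sets), "samples discovered")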

to_dataframe

to_dataframe(validate: bool = True) -> pd.DataFrame

Export the design to a pandas DataFrame, validated by the DesignDataFrame schema.

Columns: sample_id, r1, r2, plus all metadata fields, indexed by uid.

Source code in seqnado/inputs/fastq.py, lines 510-559
def to_dataframe(self, validate: bool = True) -> pd.DataFrame:
    """
    Export the design to a pandas DataFrame, validated by DataFrameDesign.

    Columns: sample_name, r1, r2, plus all metadata fields.
    """
    import pandas as pd

    rows: list[dict[str, Any]] = []

    if self.metadata:
        for fs, md in zip(self.fastq_sets, self.metadata):
            row: dict[str, Any] = {
                "sample_id": fs.sample_id,
                "r1": fs.r1.path,
                "r2": fs.r2.path if fs.r2 else None,
                "uid": f"{fs.sample_id}",
            }
            metadata_dict = md.model_dump(exclude_none=True)
            # Convert Assay enum to string value for schema validation
            if "assay" in metadata_dict and hasattr(metadata_dict["assay"], "value"):
                metadata_dict["assay"] = metadata_dict["assay"].value
            row.update(metadata_dict)
            rows.append(row)
    else:
        for fs in self.fastq_sets:
            row = {
                "sample_id": fs.sample_id,
                "r1": fs.r1.path,
                "r2": fs.r2.path if fs.r2 else None,
                "uid": f"{fs.sample_id}",
            }
            rows.append(row)

    if not rows:
        # Return empty DataFrame with expected columns
        df = pd.DataFrame(columns=["sample_id", "r1", "r2", "uid"]).set_index("uid")
    else:
        df = pd.DataFrame(rows).sort_values("sample_id").set_index("uid")

    # Define column order: critical columns first (assay, sample info, files), then other metadata
    core_cols = ["assay", "sample_id", "r1", "r2"]
    metadata_cols = [col for col in df.columns if col not in core_cols]
    ordered_cols = core_cols + sorted(metadata_cols)
    df = df[[col for col in ordered_cols if col in df.columns]]

    if validate:
        return DataFrame[DesignDataFrame](df)
    else:
        return df
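
For example, the exported frame can be written out as a design file for hand-editing; a sketch, with the output path purely illustrative.

from pathlib import Path
from seqnado.inputs import FastqCollection

def export_design(fc: FastqCollection, out: Path) -> None:
    """Write the validated design table (indexed by uid) to CSV."""
    df = fc.to_dataframe()         # validated against the DesignDataFrame schema
    df.to_csv(out)                 # columns: assay, sample_id, r1, r2, then metadata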

from_dataframe classmethod

from_dataframe(
    assay: Assay,
    df: DataFrame,
    validate_deseq2: bool = False,
    assay_for_validation: Assay | None = None,
    **fastqset_kwargs: Any
) -> FastqCollection

Build a FastqCollection from a DataFrame, validated by the DesignDataFrame schema.

Expects columns: sample_id, r1, r2, plus any metadata fields.

Parameters:

  assay (Assay, required): The assay type.
  df (DataFrame, required): DataFrame with sample metadata.
  validate_deseq2 (bool, default False): If True, require deseq2 field to be non-null (for RNA assays).
  assay_for_validation (Assay | None, default None): Assay type to check in validation context.
  **fastqset_kwargs (Any, default {}): Additional kwargs for FastqSet.
Source code in seqnado/inputs/fastq.py, lines 561-600
@classmethod
def from_dataframe(
    cls, assay: Assay, df: pd.DataFrame, validate_deseq2: bool = False, assay_for_validation: Assay | None = None, **fastqset_kwargs: Any
) -> FastqCollection:
    """
    Build a SampleCollection from a DataFrame, validated by DataFrameDesign.

    Expects columns: sample_name, r1, r2, plus any metadata fields.

    Args:
        assay: The assay type
        df: DataFrame with sample metadata
        validate_deseq2: If True, require deseq2 field to be non-null (for RNA assays)
        assay_for_validation: Assay type to check in validation context
        **fastqset_kwargs: Additional kwargs for FastqSet
    """
    df = DesignDataFrame.validate(df)
    fastq_sets: list[FastqSet] = []
    metadata: list[Metadata] = []
    metadata_fields = set(Metadata.model_fields.keys())

    # Use provided assay_for_validation or fall back to assay
    validation_assay = assay_for_validation or assay

    for rec in df.to_dict(orient="records"):
        # Build FastqSet
        r2_path = rec.get("r2")
        fs = FastqSet(
            sample_id=rec["sample_id"],
            r1=FastqFile(path=rec["r1"]),
            r2=FastqFile(path=r2_path) if pd.notna(r2_path) else None,
            **fastqset_kwargs,
        )
        fastq_sets.append(fs)

        # Collect metadata with validation context
        meta_fields = {k: rec.get(k) for k in metadata_fields if k in rec}
        metadata.append(Metadata.model_validate(meta_fields, context={'validate_deseq2': validate_deseq2, 'assay': validation_assay}))

    return cls(assay=assay, fastq_sets=fastq_sets, metadata=metadata)
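
A hedged round-trip sketch: read a design table back and rebuild the collection. The CSV layout mirrors what to_dataframe writes; the Assay member and the use of validate_deseq2 are assumptions for illustration.

import pandas as pd
from seqnado.inputs import FastqCollection
from seqnado.inputs import Assay   # import path assumed

design = pd.read_csv("design.csv", index_col="uid")
fc = FastqCollection.from_dataframe(
    assay=Assay.RNA,               # assumed Assay member
    df=design,
    validate_deseq2=True,          # require a non-null deseq2 column, e.g. for RNA designs
)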

seqnado.inputs.BamCollection

Bases: BaseCollection

Collection of BAM files with optional per-sample metadata.

Provides convenience constructors analogous to FastqCollection but without paired-end logic.

from_dataframe classmethod

from_dataframe(
    assay: Assay,
    df: Any,
    validate_deseq2: bool = False,
    assay_for_validation: Assay | None = None,
    **kwargs: Any
) -> BamCollection

Build a BamCollection from a DataFrame.

Expects columns: sample_id, bam, plus any metadata fields.

Parameters:

  assay (Assay, required): The assay type.
  df (Any, required): DataFrame with sample metadata.
  validate_deseq2 (bool, default False): If True, require deseq2 field to be non-null (for RNA assays).
  assay_for_validation (Assay | None, default None): Assay type to check in validation context.
  **kwargs (Any, default {}): Additional kwargs.
Source code in seqnado/inputs/bam.py, lines 111-143
@classmethod
def from_dataframe(
    cls, assay: Assay, df: Any, validate_deseq2: bool = False, assay_for_validation: Assay | None = None, **kwargs: Any
) -> BamCollection:
    """Build a BamCollection from a DataFrame.

    Expects columns: sample_id, bam, plus any metadata fields.

    Args:
        assay: The assay type
        df: DataFrame with sample metadata
        validate_deseq2: If True, require deseq2 field to be non-null (for RNA assays)
        assay_for_validation: Assay type to check in validation context
        **kwargs: Additional kwargs
    """
    import pandas as pd

    bam_files: list[BamFile] = []
    metadata: list[Metadata] = []
    metadata_fields = set(Metadata.model_fields.keys())

    # Use provided assay_for_validation or fall back to assay
    validation_assay = assay_for_validation or assay

    for rec in df.to_dict(orient="records"):
        # Build BamFile
        bam_files.append(BamFile(path=Path(rec["bam"])))

        # Collect metadata with validation context
        meta_fields = {k: rec.get(k) for k in metadata_fields if k in rec}
        metadata.append(Metadata.model_validate(meta_fields, context={'validate_deseq2': validate_deseq2, 'assay': validation_assay}))

    return cls(assay=assay, bam_files=bam_files, metadata=metadata)
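
And analogously for BAM inputs, a sketch using an in-memory frame; the paths and the Assay member are placeholders.

import pandas as pd
from seqnado.inputs import BamCollection
from seqnado.inputs import Assay   # import path assumed

design = pd.DataFrame(
    {
        "sample_id": ["sampleA", "sampleB"],
        "bam": ["aligned/sampleA.bam", "aligned/sampleB.bam"],
    }
)
bc = BamCollection.from_dataframe(assay=Assay.ATAC, df=design)   # assumed Assay member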
