Open In Colab

Bulk RNA-seq (with differential transcript expression)

Open in Google Colab

Notebook authored by Delaney K. Sullivan.

We ask that if you quantify bulk-RNA-seq with kallisto and use the results in a publication that you please cite the following publications:

  • Bray NL, Pimentel H, Melsted P, Pachter, L. Near-optimal probabilistic RNA-seq quantification. Nat biotechnol., 2016;34(5):525-527.

  • Sullivan DK, Min KH (Joseph), Hjörleifsson KE, Luebbert L, Holley G, Moses L, et al. kallisto, bustools, and kb-python for quantifying bulk, single-cell, and single-nucleus RNA-seq. Nat Protoc. 2025;20:587–607.

Install kb-python

[ ]:
!pip install kb_python
Collecting kb_python
  Downloading kb_python-0.29.5-py3-none-any.whl.metadata (11 kB)
Collecting anndata>=0.9.2 (from kb_python)
  Downloading anndata-0.11.4-py3-none-any.whl.metadata (9.3 kB)
Requirement already satisfied: h5py>=2.10.0 in /usr/local/lib/python3.11/dist-packages (from kb_python) (3.14.0)
Requirement already satisfied: Jinja2>2.10.1 in /usr/local/lib/python3.11/dist-packages (from kb_python) (3.1.6)
Collecting loompy>=3.0.6 (from kb_python)
  Downloading loompy-3.0.8.tar.gz (49 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 49.6/49.6 kB 1.9 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Requirement already satisfied: nbconvert>=5.6.0 in /usr/local/lib/python3.11/dist-packages (from kb_python) (7.16.6)
Requirement already satisfied: nbformat>=4.4.0 in /usr/local/lib/python3.11/dist-packages (from kb_python) (5.10.4)
Collecting ngs-tools>=1.8.6 (from kb_python)
  Downloading ngs_tools-1.8.6-py3-none-any.whl.metadata (2.0 kB)
Requirement already satisfied: numpy>=1.17.2 in /usr/local/lib/python3.11/dist-packages (from kb_python) (2.0.2)
Requirement already satisfied: pandas>=1.5.3 in /usr/local/lib/python3.11/dist-packages (from kb_python) (2.2.2)
Requirement already satisfied: plotly>=4.5.0 in /usr/local/lib/python3.11/dist-packages (from kb_python) (5.24.1)
Requirement already satisfied: requests>=2.22.0 in /usr/local/lib/python3.11/dist-packages (from kb_python) (2.32.3)
Collecting scanpy>=1.4.4.post1 (from kb_python)
  Downloading scanpy-1.11.2-py3-none-any.whl.metadata (9.1 kB)
Requirement already satisfied: scikit-learn>=0.21.3 in /usr/local/lib/python3.11/dist-packages (from kb_python) (1.6.1)
Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.11/dist-packages (from kb_python) (4.14.0)
Collecting biopython>=1.8 (from kb_python)
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting array-api-compat!=1.5,>1.4 (from anndata>=0.9.2->kb_python)
  Downloading array_api_compat-1.12.0-py3-none-any.whl.metadata (2.5 kB)
Requirement already satisfied: natsort in /usr/local/lib/python3.11/dist-packages (from anndata>=0.9.2->kb_python) (8.4.0)
Requirement already satisfied: packaging>=24.2 in /usr/local/lib/python3.11/dist-packages (from anndata>=0.9.2->kb_python) (24.2)
Requirement already satisfied: scipy>1.8 in /usr/local/lib/python3.11/dist-packages (from anndata>=0.9.2->kb_python) (1.15.3)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from Jinja2>2.10.1->kb_python) (3.0.2)
Requirement already satisfied: setuptools in /usr/local/lib/python3.11/dist-packages (from loompy>=3.0.6->kb_python) (75.2.0)
Requirement already satisfied: numba in /usr/local/lib/python3.11/dist-packages (from loompy>=3.0.6->kb_python) (0.60.0)
Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from loompy>=3.0.6->kb_python) (8.2.1)
Collecting numpy-groupies (from loompy>=3.0.6->kb_python)
  Downloading numpy_groupies-0.11.3-py3-none-any.whl.metadata (18 kB)
Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.11/dist-packages (from nbconvert>=5.6.0->kb_python) (4.13.4)
Requirement already satisfied: bleach!=5.0.0 in /usr/local/lib/python3.11/dist-packages (from bleach[css]!=5.0.0->nbconvert>=5.6.0->kb_python) (6.2.0)
Requirement already satisfied: defusedxml in /usr/local/lib/python3.11/dist-packages (from nbconvert>=5.6.0->kb_python) (0.7.1)
Requirement already satisfied: jupyter-core>=4.7 in /usr/local/lib/python3.11/dist-packages (from nbconvert>=5.6.0->kb_python) (5.8.1)
Requirement already satisfied: jupyterlab-pygments in /usr/local/lib/python3.11/dist-packages (from nbconvert>=5.6.0->kb_python) (0.3.0)
Requirement already satisfied: mistune<4,>=2.0.3 in /usr/local/lib/python3.11/dist-packages (from nbconvert>=5.6.0->kb_python) (3.1.3)
Requirement already satisfied: nbclient>=0.5.0 in /usr/local/lib/python3.11/dist-packages (from nbconvert>=5.6.0->kb_python) (0.10.2)
Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.11/dist-packages (from nbconvert>=5.6.0->kb_python) (1.5.1)
Requirement already satisfied: pygments>=2.4.1 in /usr/local/lib/python3.11/dist-packages (from nbconvert>=5.6.0->kb_python) (2.19.1)
Requirement already satisfied: traitlets>=5.1 in /usr/local/lib/python3.11/dist-packages (from nbconvert>=5.6.0->kb_python) (5.7.1)
Requirement already satisfied: fastjsonschema>=2.15 in /usr/local/lib/python3.11/dist-packages (from nbformat>=4.4.0->kb_python) (2.21.1)
Requirement already satisfied: jsonschema>=2.6 in /usr/local/lib/python3.11/dist-packages (from nbformat>=4.4.0->kb_python) (4.24.0)
Requirement already satisfied: joblib>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from ngs-tools>=1.8.6->kb_python) (1.5.1)
Collecting pysam>=0.16.0.1 (from ngs-tools>=1.8.6->kb_python)
  Downloading pysam-0.23.3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.7 kB)
Collecting shortuuid>=1.0.1 (from ngs-tools>=1.8.6->kb_python)
  Downloading shortuuid-1.0.13-py3-none-any.whl.metadata (5.8 kB)
Requirement already satisfied: tqdm>=4.50.0 in /usr/local/lib/python3.11/dist-packages (from ngs-tools>=1.8.6->kb_python) (4.67.1)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas>=1.5.3->kb_python) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas>=1.5.3->kb_python) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas>=1.5.3->kb_python) (2025.2)
Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.11/dist-packages (from plotly>=4.5.0->kb_python) (9.1.2)
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests>=2.22.0->kb_python) (3.4.2)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests>=2.22.0->kb_python) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests>=2.22.0->kb_python) (2.4.0)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests>=2.22.0->kb_python) (2025.6.15)
Collecting legacy-api-wrap>=1.4.1 (from scanpy>=1.4.4.post1->kb_python)
  Downloading legacy_api_wrap-1.4.1-py3-none-any.whl.metadata (2.1 kB)
Requirement already satisfied: matplotlib>=3.7.5 in /usr/local/lib/python3.11/dist-packages (from scanpy>=1.4.4.post1->kb_python) (3.10.0)
Requirement already satisfied: networkx>=2.7.1 in /usr/local/lib/python3.11/dist-packages (from scanpy>=1.4.4.post1->kb_python) (3.5)
Requirement already satisfied: patsy!=1.0.0 in /usr/local/lib/python3.11/dist-packages (from scanpy>=1.4.4.post1->kb_python) (1.0.1)
Requirement already satisfied: pynndescent>=0.5.13 in /usr/local/lib/python3.11/dist-packages (from scanpy>=1.4.4.post1->kb_python) (0.5.13)
Requirement already satisfied: seaborn>=0.13.2 in /usr/local/lib/python3.11/dist-packages (from scanpy>=1.4.4.post1->kb_python) (0.13.2)
Collecting session-info2 (from scanpy>=1.4.4.post1->kb_python)
  Downloading session_info2-0.1.2-py3-none-any.whl.metadata (2.5 kB)
Requirement already satisfied: statsmodels>=0.14.4 in /usr/local/lib/python3.11/dist-packages (from scanpy>=1.4.4.post1->kb_python) (0.14.4)
Requirement already satisfied: umap-learn>=0.5.6 in /usr/local/lib/python3.11/dist-packages (from scanpy>=1.4.4.post1->kb_python) (0.5.7)
Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn>=0.21.3->kb_python) (3.6.0)
Requirement already satisfied: webencodings in /usr/local/lib/python3.11/dist-packages (from bleach!=5.0.0->bleach[css]!=5.0.0->nbconvert>=5.6.0->kb_python) (0.5.1)
Requirement already satisfied: tinycss2<1.5,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from bleach[css]!=5.0.0->nbconvert>=5.6.0->kb_python) (1.4.0)
Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.11/dist-packages (from jsonschema>=2.6->nbformat>=4.4.0->kb_python) (25.3.0)
Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.11/dist-packages (from jsonschema>=2.6->nbformat>=4.4.0->kb_python) (2025.4.1)
Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.11/dist-packages (from jsonschema>=2.6->nbformat>=4.4.0->kb_python) (0.36.2)
Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.11/dist-packages (from jsonschema>=2.6->nbformat>=4.4.0->kb_python) (0.25.1)
Requirement already satisfied: platformdirs>=2.5 in /usr/local/lib/python3.11/dist-packages (from jupyter-core>=4.7->nbconvert>=5.6.0->kb_python) (4.3.8)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.7.5->scanpy>=1.4.4.post1->kb_python) (1.3.2)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.7.5->scanpy>=1.4.4.post1->kb_python) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.7.5->scanpy>=1.4.4.post1->kb_python) (4.58.4)
Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.7.5->scanpy>=1.4.4.post1->kb_python) (1.4.8)
Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.7.5->scanpy>=1.4.4.post1->kb_python) (11.2.1)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.7.5->scanpy>=1.4.4.post1->kb_python) (3.2.3)
Requirement already satisfied: jupyter-client>=6.1.12 in /usr/local/lib/python3.11/dist-packages (from nbclient>=0.5.0->nbconvert>=5.6.0->kb_python) (6.1.12)
Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.11/dist-packages (from numba->loompy>=3.0.6->kb_python) (0.43.0)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas>=1.5.3->kb_python) (1.17.0)
Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.11/dist-packages (from beautifulsoup4->nbconvert>=5.6.0->kb_python) (2.7)
Requirement already satisfied: pyzmq>=13 in /usr/local/lib/python3.11/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert>=5.6.0->kb_python) (24.0.1)
Requirement already satisfied: tornado>=4.1 in /usr/local/lib/python3.11/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert>=5.6.0->kb_python) (6.4.2)
Downloading kb_python-0.29.5-py3-none-any.whl (36.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 36.5/36.5 MB 20.3 MB/s eta 0:00:00
Downloading anndata-0.11.4-py3-none-any.whl (144 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 144.5/144.5 kB 6.9 MB/s eta 0:00:00
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.3/3.3 MB 69.2 MB/s eta 0:00:00
Downloading ngs_tools-1.8.6-py3-none-any.whl (50.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 50.9/50.9 MB 12.7 MB/s eta 0:00:00
Downloading scanpy-1.11.2-py3-none-any.whl (2.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 18.2 MB/s eta 0:00:00
Downloading array_api_compat-1.12.0-py3-none-any.whl (58 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 58.2/58.2 kB 4.3 MB/s eta 0:00:00
Downloading legacy_api_wrap-1.4.1-py3-none-any.whl (10.0 kB)
Downloading pysam-0.23.3-cp311-cp311-manylinux_2_28_x86_64.whl (26.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.5/26.5 MB 52.0 MB/s eta 0:00:00
Downloading shortuuid-1.0.13-py3-none-any.whl (10 kB)
Downloading numpy_groupies-0.11.3-py3-none-any.whl (40 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.8/40.8 kB 2.5 MB/s eta 0:00:00
Downloading session_info2-0.1.2-py3-none-any.whl (14 kB)
Building wheels for collected packages: loompy
  Building wheel for loompy (setup.py) ... done
  Created wheel for loompy: filename=loompy-3.0.8-py3-none-any.whl size=54012 sha256=8ecdd255dc8928c091b369b81fefc150a84b5cfd88f83c0e6359fa2371d8f709
  Stored in directory: /root/.cache/pip/wheels/0a/65/00/30ea772562c1e326e9a076e6a526216abc6c5456852cc2e0e4
Successfully built loompy
Installing collected packages: shortuuid, session-info2, pysam, numpy-groupies, legacy-api-wrap, biopython, array-api-compat, ngs-tools, loompy, anndata, scanpy, kb_python
Successfully installed anndata-0.11.4 array-api-compat-1.12.0 biopython-1.85 kb_python-0.29.5 legacy-api-wrap-1.4.1 loompy-3.0.8 ngs-tools-1.8.6 numpy-groupies-0.11.3 pysam-0.23.3 scanpy-1.11.2 session-info2-0.1.2 shortuuid-1.0.13

Download datasets

Differential analysis of gene regulation at transcript resolution with RNA-seq by Cole Trapnell, David G Henderickson, Martin Savageau, Loyal Goff, John L Rinn and Lior Pachter, Nature Biotechnology 31, 46–53 (2013).

The control (C) and knockdown (KD) fibroblast samples were subsetted to 1 million reads each.

Control (n=3)

[ ]:
!wget -q https://github.com/pachterlab/data/releases/download/v1/C_1_R1.fastq.gz
!wget -q https://github.com/pachterlab/data/releases/download/v1/C_1_R2.fastq.gz
!wget -q https://github.com/pachterlab/data/releases/download/v1/C_2_R1.fastq.gz
!wget -q https://github.com/pachterlab/data/releases/download/v1/C_2_R2.fastq.gz
!wget -q https://github.com/pachterlab/data/releases/download/v1/C_3_R1.fastq.gz
!wget -q https://github.com/pachterlab/data/releases/download/v1/C_3_R2.fastq.gz

Knockdown (n=3)

[ ]:
!wget -q https://github.com/pachterlab/data/releases/download/v1/KD_1_R1.fastq.gz
!wget -q https://github.com/pachterlab/data/releases/download/v1/KD_1_R2.fastq.gz
!wget -q https://github.com/pachterlab/data/releases/download/v1/KD_2_R1.fastq.gz
!wget -q https://github.com/pachterlab/data/releases/download/v1/KD_2_R2.fastq.gz
!wget -q https://github.com/pachterlab/data/releases/download/v1/KD_3_R1.fastq.gz
!wget -q https://github.com/pachterlab/data/releases/download/v1/KD_3_R2.fastq.gz

Download kallisto human index

[ ]:
!kb ref -d human -i human_index.idx -g human_t2g.txt
[2025-06-19 02:40:51,007]    INFO [download] Downloading files for human (standard workflow) from https://github.com/pachterlab/kallisto-transcriptome-indices/releases/download/v1/human_index_standard.tar.xz to tmp/human_index_standard.tar.xz
100% 138M/138M [00:01<00:00, 134MB/s]
[2025-06-19 02:40:52,095]    INFO [download] Extracting files from tmp/human_index_standard.tar.xz

Map reads to index

Below, we quantify RNA-seq reads using kallisto (via kb count) with bootstrapping (via --bootstraps) to obtain estimates of quantification uncertainty. Here, we set the number of bootstraps to 10.

For the input to kb count, we supply all FASTQ files on the command-line (and the order in which the files are supplied determines the sample identities after read quantification).

Alternatively, we could have created a batch.txt file containing the following:

Control_1   C_1_R1.fastq.gz  C_1_R2.fastq.gz
Control_2   C_2_R1.fastq.gz  C_2_R2.fastq.gz
Control_3   C_3_R1.fastq.gz  C_3_R2.fastq.gz
Knockdown_1 KD_1_R1.fastq.gz KD_1_R2.fastq.gz
Knockdown_2 KD_2_R1.fastq.gz KD_2_R2.fastq.gz
Knockdown_3 KD_3_R1.fastq.gz KD_3_R2.fastq.gz

And then supply that batch.txt file directly to the command in lieu of the individual FASTQ files.

[ ]:
!kb count -x BULK -i human_index.idx -g human_t2g.txt --parity=paired --tcc --matrix-to-directories -o output_dir \
--bootstraps=5 -t 2 --overwrite --verbose \
C_1_R1.fastq.gz C_1_R2.fastq.gz C_2_R1.fastq.gz C_2_R2.fastq.gz C_3_R1.fastq.gz C_3_R2.fastq.gz \
KD_1_R1.fastq.gz KD_1_R2.fastq.gz KD_2_R1.fastq.gz KD_2_R2.fastq.gz KD_3_R1.fastq.gz KD_3_R2.fastq.gz
[2025-06-19 02:41:28,558]   DEBUG [main] Printing verbose output
[2025-06-19 02:41:30,761]   DEBUG [main] kallisto binary located at /usr/local/lib/python3.11/dist-packages/kb_python/bins/linux/kallisto/kallisto
[2025-06-19 02:41:30,761]   DEBUG [main] bustools binary located at /usr/local/lib/python3.11/dist-packages/kb_python/bins/linux/bustools/bustools
[2025-06-19 02:41:30,761]   DEBUG [main] Creating `output_dir/tmp` directory
[2025-06-19 02:41:30,762]   DEBUG [main] Namespace(list=False, command='count', tmp=None, keep_tmp=False, verbose=True, i='human_index.idx', g='human_t2g.txt', x='BULK', o='output_dir', num=False, w=None, exact_barcodes=False, r=None, t=2, m='2G', strand=None, inleaved=False, genomebam=False, aa=False, gtf=None, chromosomes=None, workflow='standard', em=False, mm=False, tcc=True, filter=None, filter_threshold=None, c1=None, c2=None, overwrite=True, dry_run=False, batch_barcodes=False, loom=False, h5ad=False, loom_names='barcode,target_name', sum='none', cellranger=False, gene_names=False, N=None, report=False, no_inspect=False, long=False, threshold=0.8, error_rate=None, platform='ONT', kallisto='/usr/local/lib/python3.11/dist-packages/kb_python/bins/linux/kallisto/kallisto', bustools='/usr/local/lib/python3.11/dist-packages/kb_python/bins/linux/bustools/bustools', opt_off=False, k=31, no_validate=False, no_fragment=False, union=False, no_jump=False, quant_umis=False, keep_flags=False, parity='paired', fragment_l=None, fragment_s=None, bootstraps=5, matrix_to_files=False, matrix_to_directories=True, fastqs=['C_1_R1.fastq.gz', 'C_1_R2.fastq.gz', 'C_2_R1.fastq.gz', 'C_2_R2.fastq.gz', 'C_3_R1.fastq.gz', 'C_3_R2.fastq.gz', 'KD_1_R1.fastq.gz', 'KD_1_R2.fastq.gz', 'KD_2_R1.fastq.gz', 'KD_2_R2.fastq.gz', 'KD_3_R1.fastq.gz', 'KD_3_R2.fastq.gz'])
[2025-06-19 02:41:33,929]    INFO [count] Using index human_index.idx to generate BUS file to output_dir from
[2025-06-19 02:41:33,929]    INFO [count]         C_1_R1.fastq.gz
[2025-06-19 02:41:33,929]    INFO [count]         C_1_R2.fastq.gz
[2025-06-19 02:41:33,929]    INFO [count]         C_2_R1.fastq.gz
[2025-06-19 02:41:33,929]    INFO [count]         C_2_R2.fastq.gz
[2025-06-19 02:41:33,929]    INFO [count]         C_3_R1.fastq.gz
[2025-06-19 02:41:33,929]    INFO [count]         C_3_R2.fastq.gz
[2025-06-19 02:41:33,929]    INFO [count]         KD_1_R1.fastq.gz
[2025-06-19 02:41:33,929]    INFO [count]         KD_1_R2.fastq.gz
[2025-06-19 02:41:33,929]    INFO [count]         KD_2_R1.fastq.gz
[2025-06-19 02:41:33,929]    INFO [count]         KD_2_R2.fastq.gz
[2025-06-19 02:41:33,929]    INFO [count]         KD_3_R1.fastq.gz
[2025-06-19 02:41:33,929]    INFO [count]         KD_3_R2.fastq.gz
[2025-06-19 02:41:33,929]   DEBUG [count] kallisto bus -i human_index.idx -o output_dir -x BULK -t 2 --paired C_1_R1.fastq.gz C_1_R2.fastq.gz C_2_R1.fastq.gz C_2_R2.fastq.gz C_3_R1.fastq.gz C_3_R2.fastq.gz KD_1_R1.fastq.gz KD_1_R2.fastq.gz KD_2_R1.fastq.gz KD_2_R2.fastq.gz KD_3_R1.fastq.gz KD_3_R2.fastq.gz
[2025-06-19 02:41:33,933]   DEBUG [count]
[2025-06-19 02:41:50,491]   DEBUG [count] [index] k-mer length: 31
[2025-06-19 02:41:58,110]   DEBUG [count] [index] number of targets: 227,665
[2025-06-19 02:41:58,110]   DEBUG [count] [index] number of k-mers: 139,900,295
[2025-06-19 02:41:58,110]   DEBUG [count] [index] number of D-list k-mers: 5,477,475
[2025-06-19 02:41:58,210]   DEBUG [count] [quant] running in paired-end mode
[2025-06-19 02:41:58,210]   DEBUG [count] [quant] will process pair 1: C_1_R1.fastq.gz
[2025-06-19 02:41:58,210]   DEBUG [count] C_1_R2.fastq.gz
[2025-06-19 02:41:58,210]   DEBUG [count] [quant] will process pair 1: C_2_R1.fastq.gz
[2025-06-19 02:41:58,210]   DEBUG [count] C_2_R2.fastq.gz
[2025-06-19 02:41:58,210]   DEBUG [count] [quant] will process pair 1: C_3_R1.fastq.gz
[2025-06-19 02:41:58,210]   DEBUG [count] C_3_R2.fastq.gz
[2025-06-19 02:41:58,210]   DEBUG [count] [quant] will process pair 1: KD_1_R1.fastq.gz
[2025-06-19 02:41:58,210]   DEBUG [count] KD_1_R2.fastq.gz
[2025-06-19 02:41:58,211]   DEBUG [count] [quant] will process pair 1: KD_2_R1.fastq.gz
[2025-06-19 02:41:58,211]   DEBUG [count] KD_2_R2.fastq.gz
[2025-06-19 02:41:58,211]   DEBUG [count] [quant] will process pair 1: KD_3_R1.fastq.gz
[2025-06-19 02:41:58,211]   DEBUG [count] KD_3_R2.fastq.gz
[2025-06-19 02:42:36,516]   DEBUG [count] [quant] finding pseudoalignments for all files ...
[2025-06-19 02:43:12,851]   DEBUG [count] [progress] 1M reads processed (92.9% mapped)
[2025-06-19 02:43:47,534]   DEBUG [count] [progress] 2M reads processed (92.9% mapped)
[2025-06-19 02:44:23,935]   DEBUG [count] [progress] 3M reads processed (92.6% mapped)
[2025-06-19 02:45:01,667]   DEBUG [count] [progress] 4M reads processed (92.4% mapped)
[2025-06-19 02:45:31,676]   DEBUG [count] [progress] 5M reads processed (92.4% mapped)              done
[2025-06-19 02:45:31,676]   DEBUG [count] [quant] processed 6,000,000 reads, 5,541,414 reads pseudoaligned
[2025-06-19 02:45:32,177]   DEBUG [count]
[2025-06-19 02:45:34,181]    INFO [count] Sorting BUS file output_dir/output.bus to output_dir/tmp/output.s.bus
[2025-06-19 02:45:34,181]   DEBUG [count] bustools sort -o output_dir/tmp/output.s.bus -T output_dir/tmp -t 2 -m 2G output_dir/output.bus
[2025-06-19 02:45:35,887]   DEBUG [count] partition time: 0.034993s
[2025-06-19 02:45:36,995]   DEBUG [count] all fits in buffer
[2025-06-19 02:45:38,196]   DEBUG [count] Read in 5541414 BUS records
[2025-06-19 02:45:38,196]   DEBUG [count] reading time 0.05138s
[2025-06-19 02:45:38,196]   DEBUG [count] sorting time 1.35408s
[2025-06-19 02:45:38,196]   DEBUG [count] writing time 0.057799s
[2025-06-19 02:45:38,196]    INFO [count] Inspecting BUS file output_dir/tmp/output.s.bus
[2025-06-19 02:45:38,196]   DEBUG [count] bustools inspect -o output_dir/inspect.json output_dir/tmp/output.s.bus
[2025-06-19 02:45:39,298]    INFO [count] Generating count matrix output_dir/counts_unfiltered/cells_x_tcc from BUS file output_dir/tmp/output.s.bus
[2025-06-19 02:45:39,298]   DEBUG [count] bustools count -o output_dir/counts_unfiltered/cells_x_tcc -g human_t2g.txt -e output_dir/matrix.ec -t output_dir/transcripts.txt --multimapping --cm output_dir/tmp/output.s.bus
[2025-06-19 02:45:42,276]   DEBUG [count] output_dir/counts_unfiltered/cells_x_tcc.mtx passed validation
[2025-06-19 02:45:42,276]    INFO [count] Quantifying transcript abundances to output_dir/quant_unfiltered from mtx file output_dir/counts_unfiltered/cells_x_tcc.mtx
[2025-06-19 02:45:42,276]   DEBUG [count] kallisto quant-tcc -o output_dir/quant_unfiltered -i human_index.idx -e output_dir/counts_unfiltered/cells_x_tcc.ec.txt -g human_t2g.txt -t 2 -f output_dir/flens.txt -b 5 --matrix-to-directories output_dir/counts_unfiltered/cells_x_tcc.mtx
[2025-06-19 02:45:42,377]   DEBUG [count]
[2025-06-19 02:45:58,624]   DEBUG [count] [index] k-mer length: 31
[2025-06-19 02:46:07,976]   DEBUG [count] [index] number of targets: 227,665
[2025-06-19 02:46:07,976]   DEBUG [count] [index] number of k-mers: 139,900,295
[2025-06-19 02:46:07,976]   DEBUG [count] [index] number of D-list k-mers: 5,477,475
[2025-06-19 02:46:09,979]   DEBUG [count] [index] number of equivalence classes loaded from file: 404,862
[2025-06-19 02:46:09,979]   DEBUG [count] [index] not using the D-list k-mers
[2025-06-19 02:46:09,979]   DEBUG [count] [tcc] Parsing transcript-compatibility counts (TCC) file as a matrix file
[2025-06-19 02:46:09,979]   DEBUG [count] [tcc] Matrix dimensions: 6 x 404,862
[2025-06-19 02:46:09,979]   DEBUG [count] [tcc] Bootstrapping will be performed and outputted as HDF5
[2025-06-19 02:46:10,881]   DEBUG [count] [quant] Running EM algorithm...
[2025-06-19 02:46:10,881]   DEBUG [count] [quant] Processing sample/cell 0
[2025-06-19 02:46:10,881]   DEBUG [count] [quant] Processing sample/cell 1
[2025-06-19 02:59:55,076]   DEBUG [count] [quant] Processing sample/cell 2
[2025-06-19 02:59:55,076]   DEBUG [count] [quant] Processing sample/cell 3
[2025-06-19 03:14:38,980]   DEBUG [count] [quant] Processing sample/cell formed and outputted as HDF5 a [quant] Processing sample/cell formed and outputted as HDF5 a 5
[2025-06-19 03:14:38,980]   DEBUG [count] 4
[2025-06-19 03:28:33,037]   DEBUG [count] done
[2025-06-19 03:28:33,038]   DEBUG [count]
[2025-06-19 03:28:36,457]   DEBUG [count] output_dir/quant_unfiltered/matrix.abundance.gene.mtx passed validation
[2025-06-19 03:28:36,468]   DEBUG [count] output_dir/quant_unfiltered/matrix.abundance.gene.tpm.mtx passed validation
[2025-06-19 03:28:36,495]   DEBUG [count] output_dir/quant_unfiltered/matrix.abundance.mtx passed validation
[2025-06-19 03:28:36,521]   DEBUG [count] output_dir/quant_unfiltered/matrix.abundance.tpm.mtx passed validation
[2025-06-19 03:28:36,523]   DEBUG [main] Removing `output_dir/tmp` directory

Now inspect the output

There are several “abundance” directories that contain our quantifications. The numbers correspond to the order with which we supplied the FASTQ files to the kb count command (i.e. abundance_1, abundance_2, and abundance_3 are the control samples and abundance_4, abundance_5, and abundance_6 are the knockdown samples)

[ ]:
!ls output_dir/quant_unfiltered
abundance_1  abundance_6                    matrix.abundance.tpm.mtx
abundance_2  genes.txt                      matrix.efflens.mtx
abundance_3  matrix.abundance.gene.mtx      matrix.fld.tsv
abundance_4  matrix.abundance.gene.tpm.mtx  transcript_lengths.txt
abundance_5  matrix.abundance.mtx           transcripts.txt

Differential expression of transcripts

Set up R environment and edgeR

[ ]:
%load_ext rpy2.ipython
[ ]:
%%R
if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("edgeR", ask=FALSE, quiet=TRUE)
BiocManager::install("rhdf5", ask=FALSE, quiet=TRUE)
Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/src/contrib/BiocManager_1.30.26.tar.gz'
Content type 'application/x-gzip' length 594489 bytes (580 KB)
==================================================
downloaded 580 KB


The downloaded source packages are in
        ‘/tmp/RtmplaovhL/downloaded_packages’
'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cran.rstudio.com
Bioconductor version 3.21 (BiocManager 1.30.26), R 4.5.1 (2025-06-13)
Installing package(s) 'BiocVersion', 'edgeR'
also installing the dependencies ‘statmod’, ‘limma’, ‘locfit’

Old packages: 'data.table', 'evaluate'
'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cran.rstudio.com
Bioconductor version 3.21 (BiocManager 1.30.26), R 4.5.1 (2025-06-13)
Installing package(s) 'rhdf5'
also installing the dependencies ‘Rhdf5lib’, ‘rhdf5filters’


Use edgeR for differential transcript expression analysis

We list the paths to the abundance directories and then run edgeR on them via catchKallisto. See the follow publication for details on edgeR:

Chen, Y., Chen, L., Lun, A. T., Baldoni, P. L., & Smyth, G. K. (2025). edgeR v4: powerful differential analysis of sequencing data with expanded functionality and improved support for small counts and larger datasets. Nucleic Acids Research, 53(2), gkaf018.

[ ]:
%%R
require(edgeR)
paths <- c("output_dir/quant_unfiltered/abundance_1",
           "output_dir/quant_unfiltered/abundance_2",
           "output_dir/quant_unfiltered/abundance_3",
           "output_dir/quant_unfiltered/abundance_4",
           "output_dir/quant_unfiltered/abundance_5",
           "output_dir/quant_unfiltered/abundance_6")
results <- catchKallisto(paths)
Reading output_dir/quant_unfiltered/abundance_1, 227665 transcripts, 5 bootstraps
Reading output_dir/quant_unfiltered/abundance_2, 227665 transcripts, 5 bootstraps
Reading output_dir/quant_unfiltered/abundance_3, 227665 transcripts, 5 bootstraps
Reading output_dir/quant_unfiltered/abundance_4, 227665 transcripts, 5 bootstraps
Reading output_dir/quant_unfiltered/abundance_5, 227665 transcripts, 5 bootstraps
Reading output_dir/quant_unfiltered/abundance_6, 227665 transcripts, 5 bootstraps
Loading required package: edgeR
Loading required package: limma

[ ]:
%%R
print(results$counts[1:5,,drop=F])
print(length(results$annotation$Overdispersion))
                  output_dir/quant_unfiltered/abundance_1
ENST00000308647.8                               18.119158
ENST00000378736.3                                0.000000
ENST00000472194.6                                3.606886
ENST00000474481.1                                1.487577
ENST00000485748.5                                1.878091
                  output_dir/quant_unfiltered/abundance_2
ENST00000308647.8                              16.3508487
ENST00000378736.3                               0.4174387
ENST00000472194.6                               2.6497518
ENST00000474481.1                               0.0000000
ENST00000485748.5                               0.0000000
                  output_dir/quant_unfiltered/abundance_3
ENST00000308647.8                               17.676045
ENST00000378736.3                                2.851005
ENST00000472194.6                                0.000000
ENST00000474481.1                                0.000000
ENST00000485748.5                                0.000000
                  output_dir/quant_unfiltered/abundance_4
ENST00000308647.8                                9.134442
ENST00000378736.3                                2.485286
ENST00000472194.6                                9.018708
ENST00000474481.1                                0.000000
ENST00000485748.5                                0.000000
                  output_dir/quant_unfiltered/abundance_5
ENST00000308647.8                               13.095408
ENST00000378736.3                                0.000000
ENST00000472194.6                                1.569073
ENST00000474481.1                                1.475829
ENST00000485748.5                                1.834814
                  output_dir/quant_unfiltered/abundance_6
ENST00000308647.8                                5.862184
ENST00000378736.3                                1.859032
ENST00000472194.6                                2.428891
ENST00000474481.1                                0.000000
ENST00000485748.5                                0.000000
[1] 227665
[ ]:
%%R
samples <- c("Control_1", "Control_2", "Control_3", "Knockdown_1", "Knockdown_2", "Knockdown_3")
group <- c("C", "C", "C", "K", "K", "K") # C = control; K = knockdown
[ ]:
%%R
colnames(results$counts) <- samples
cts.scaled <- results$counts/results$annotation$Overdispersion
dge.scaled <- DGEList(counts = cts.scaled, group=group)
[ ]:
%%R
keep <- filterByExpr(dge.scaled)
dge.scaled.filtr <- dge.scaled[keep, , keep.lib.sizes = FALSE]
dge.scaled.filtr <- calcNormFactors(dge.scaled.filtr)
[ ]:
%%R
plotMDS(dge.scaled.filtr)
../../_images/bulk_notebooks_general_30_0.png
[ ]:
%%R
design <- model.matrix(~group-1,data = dge.scaled.filtr$samples)
colnames(design) <- gsub('group','',colnames(design))
dge.scaled.filtr <- estimateDisp(dge.scaled.filtr,design)
plotBCV(dge.scaled.filtr)
../../_images/bulk_notebooks_general_31_0.png
[ ]:
%%R
fit <- glmQLFit(dge.scaled.filtr,design)
plotQLDisp(fit)
../../_images/bulk_notebooks_general_32_0.png
[ ]:
%%R
qlf <- glmQLFTest(fit, contrast = makeContrasts(K-C, levels = design))
tt <- topTags(qlf,n = Inf)
is.de <- decideTests(qlf)
plotMD(qlf, status = is.de, values = c(1, -1), col = c("red","blue"), legend = "topright", cex=0.4)
../../_images/bulk_notebooks_general_33_0.png
[ ]:
%%R
plot(qlf$table$logFC, -1*log10(qlf$table$PValue), main="Volcano plot", xlab="log2FC", ylab="-log10(P-val)")
../../_images/bulk_notebooks_general_34_0.png
[ ]:
%%R
head(qlf$table)
                          logFC   logCPM            F    PValue
ENST00000356607.9  -0.001899721 5.303589 0.0000167155 0.9968000
ENST00000328089.11  0.321175658 5.765121 1.8001385388 0.2026674
ENST00000375592.8  -0.041597613 5.700996 0.0332228001 0.8581821
ENST00000346436.11  0.453813269 4.743627 1.0396290301 0.3265557
ENST00000378512.5  -0.513499461 4.907051 1.4394727424 0.2516714
ENST00000443438.5  -0.109194711 5.451238 0.0663814381 0.8007152