Natalie Elphick
Bioinformatician I
Yihang Xin (TA)
Software Engineer II
Run the following commands if you did not attend part 1:
mkdir unix_workshop
cd unix_workshop
curl -L -o unix_workshop_2023.tar.gz 'https://www.dropbox.com/s/smb12au2y82jmvq/unix_workshop_2023.tar.gz?dl=0'
tar -xzf unix_workshop_2023.tar.gz
cd unix_workshop_2023
curl -o part_2/homo_sapiens.refseq.tsv.gz https://ftp.ensembl.org/pub/current_tsv/homo_sapiens/Homo_sapiens.GRCh38.109.refseq.tsv.gz
gzip : compresses a file and replaces it with a compressed version (.gz)
tar : creates and manipulates archive files
Archive: a single file that contains one or more files and/or folders that have been bundled together and, usually, compressed
gunzip part_2/homo_sapiens.refseq.tsv.gz
du -h part_2/homo_sapiens.refseq.tsv
26M part_2/homo_sapiens.refseq.tsv
gzip part_2/homo_sapiens.refseq.tsv
du -h part_2/homo_sapiens.refseq.tsv.gz
2.7M part_2/homo_sapiens.refseq.tsv.gz
tar -czf part_1.tar.gz part_1
ls -l
total 8
drwx---rw-@ 4 nelphick staff 128 Apr 16 21:10 part_1
-rw-r--r--@ 1 nelphick staff 814 Apr 16 21:11 part_1.tar.gz
drwxr-xr-x@ 3 nelphick staff 96 Apr 16 21:11 part_2
tar -xzf part_1.tar.gz
gunzip -c : writes the decompressed data to standard output and leaves the .gz file unchanged
gunzip -c part_2/homo_sapiens.refseq.tsv.gz | head
gene_stable_id transcript_stable_id protein_stable_id xref db_name info_type source_identity xref_identity linkage_type
ENSG00000160072 ENST00000673477 ENSP00000500094 NP_001304167 RefSeq_peptide INFERRED_PAIR - - -
ENSG00000160072 ENST00000673477 ENSP00000500094 NP_114127 RefSeq_peptide DIRECT 100 100 -
ENSG00000160072 ENST00000673477 ENSP00000500094 NM_001317238 RefSeq_mRNA DIRECT 90 82 -
ENSG00000160072 ENST00000673477 ENSP00000500094 NM_031921 RefSeq_mRNA DIRECT 100 100 -
ENSG00000160072 ENST00000673477 ENSP00000500094 XM_005244806 RefSeq_mRNA_predicted DIRECT 45 94 -
ENSG00000160072 ENST00000673477 ENSP00000500094 XM_011542241 RefSeq_mRNA_predicted DIRECT 35 87 -
ENSG00000160072 ENST00000673477 ENSP00000500094 XM_011542244 RefSeq_mRNA_predicted DIRECT 90 87 -
ENSG00000160072 ENST00000673477 ENSP00000500094 XM_047431593 RefSeq_mRNA_predicted SEQUENCE_MATCH 90 96 -
ENSG00000160072 ENST00000673477 ENSP00000500094 XR_001737468 RefSeq_ncRNA_predicted DIRECT - - -
Example:
echo $HOME
/Users/nelphick
When you run a command, the shell searches the directories listed in $PATH to find its associated executable file
echo $PATH
/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/go/bin:/usr/local/mysql/bin
You can add a new directory to $PATH like this:
export PATH="/path/to/new/software:$PATH"
This only changes $PATH for the current terminal session
To make the change permanent, add the export line to ~/.bashrc or ~/.zshrc
Modifying $PATH incorrectly can break system functionality
which ls
/bin/ls
Shell scripts conventionally use the file extension .sh
nano part_2/example_script.sh
#!/bin/bash
#! (the "shebang") tells the OS where the interpreter is
which bash
/bin/bash
ls -l part_2/example_script.sh
-rw-r--r--@ 1 nelphick staff 287 Apr 16 21:11 part_2/example_script.sh
chmod u+x part_2/example_script.sh
ls -l part_2/example_script.sh
-rwxr--r--@ 1 nelphick staff 287 Apr 16 21:11 part_2/example_script.sh
#!/bin/bash
# Count the unique values in the first column (the gene IDs) of a
# gzip-compressed, tab-separated file and report the total.
# Usage: ./example_script.sh <file.tsv.gz>
# Note: a header row, if the file has one, is counted like any other value.
#
# This is a comment. Comments are ignored by the shell.
# $1 is the first argument passed to the script
echo "Counting the genes in $1"
# Count the unique genes in the file: decompress to stdout, keep only the
# first column, de-duplicate it, and count the remaining lines.
# "$1" is quoted so a path containing spaces or glob characters reaches gunzip
# as a single argument; "--" stops a name starting with "-" being read as an option.
u_genes=$(gunzip -c -- "$1" | cut -f 1 | sort -u | wc -l)
echo "There are $u_genes unique genes in $1"
./part_2/example_script.sh part_2/homo_sapiens.refseq.tsv.gz
Counting the genes in part_2/homo_sapiens.refseq.tsv.gz
There are 32538 unique genes in part_2/homo_sapiens.refseq.tsv.gz
# Print the numbers 1 through 3, one per line.
# {1..3} is brace expansion: the shell turns it into "1 2 3" before the loop runs.
for i in {1..3}; do
  echo "$i"
done
1
2
3
# Print the integers 0 through 4, one per line.
count=0
while [ "$count" -lt 5 ]; do # loop while count is less than 5
  echo "$count"
  count=$(( count + 1 )) # arithmetic expansion: add 1 to count
done
0
1
2
3
4
# Compare x against 10 and print which side of the threshold it falls on.
x=5
if [ "$x" -gt 10 ]; then # check if x is greater than 10
  echo "x is greater than 10"
else
  echo "x is not greater than 10"
fi # end if statement
x is not greater than 10
Example:
sed 's/search_string/replace_string/g' input.txt > output.txt
ssh username@remote
username is your user name on the remote server, and remote is the hostname or IP address of the remote server or computer
scp [options] [source] [destination]
scp /path/to/local/file.txt username@remote:/path/to/remote/directory/
scp username@remote:/path/to/file.txt /path/to/local/directory/
Basic command:
awk options 'pattern {action}' input_file
awk -F '\t' '{print $1+$2}' part_1/list_numbers.tsv
4
15
17
$1,$2 : the first and second fields
gunzip -c part_2/homo_sapiens.refseq.tsv.gz | \
awk -F '\t' '$5 == "RefSeq_mRNA" {sum += $7; count++} \
END {print sum / count}'
65.4642
Linear Mixed Effects Modeling
April 24-April 25, 2023 10:00am-12:00pm PDT
Machine Learning
April 28, 2023 10:00am-12:00pm PDT
Advanced Cytoscape Automation
May 2, 2023 1:00-4:00pm PDT
Introduction to RNA-Seq Analysis
May 15-May 16, 2023 9:00am-12:00pm PDT