|
|
|
APPENDIX F Mapping the unique UniGene clusters to
the human genome Split the unique UniGene clusters into individual
FASTA files Each
unique UniGene cluster was mapped to the human genome. Doing so exploited all of the sequence
information within the each cluster and allowed us to control false
positives. Whenever a repeat was
detected in a UniGene cluster that did not map within 10 kb of the repeat’s
chromosomal co-ordinates, it was not evaluated further. The Hs.seq.uniq file contains a single
sequence representing the longest, highest quality stretch of # create a new directory $ mkdir ugc_unique # parse Hs.seq.uniq # use original Hs.seq.uniq, not Hs.seq.uniq2 from
Appendix E # parse_unigene_unique.pl will format output files
correctly $ parse_unigene_unique.pl Hs.seq.uniq2 ########################### # parse_unigene_unique.pl # ########################### #!/usr/bin/perl -w # parse_unigene_unique.pl use strict; my $outputfile = "frig"; while (<>) { if
(/^.*(Hs\#\S+).*\/ug\=(\S+).*$/) {
if ($outputfile eq "frig") {
$outputfile = "$2";
unless ( open(SEQ, ">$outputfile") ) {
die "Cannot open file \"$outputfile\" to write
to!\n\n";
}
print SEQ ">$2\|$1\n";
} elsif ($outputfile ne "frig") { close(SEQ);
$outputfile = "$2";
unless ( open(SEQ, ">$outputfile") ) { die
"Cannot open file \"$outputfile\" to write to!\n\n"; } print SEQ
">$2\|$1\n";
} }
elsif (/\S+/) {
print SEQ "$&\n"; } } close(SEQ); exit; Set-up a BLAT server of all human chromosomes Create a
BLAT server using all the human chromosomes from UCSC. Soft mask (-mask) the sequences so that
repeats are not allowed to initiate an alignment but can be used to extend an
alignment. /home/perseusm/blat/gfServer -mask -canStop start
0of0 8050 /home/perseusm/goldenpath/*.nib Run mapugc.pl on each unique UniGene sequence # use the following shell script to run perl script
on each unique UniGene sequence for file in /home/unigene/ugc_unique/Hs.*; do ./mapugc.pl $file; done; Warning: You
need to manually set the gfClient command in the following script to match your
host and port settings from the gfServer command run to set-up the BLAT server. The $cutoff variable is set to 85%, that is, only
BLAT scores that are 85% of the maximum (calculated for a perfect hit) are
input into the database. ############# # mapugc.pl # #!/usr/bin/perl # mapugc.pl UniGene_Cluster.fa # use a shell for file in Hs.* loop to parse each
individual UniGene fa file # blats each unique unigene sequence again human
genome # if score is at least 85% of the theoretical max
(i.e. all bases match) and 90% of input bases match somewhere, it qualifies as
a hit # use this script to fish out chromosome
co-ordinates for unigene clusters # make sure /home/unigene/ugc_unique/ has no *.out files
in it before running this program # find -name "*.out" -print0 | xargs -0 ls # find -name "*.out" -print0 | xargs -0 rm
–f # make sure BLAT gfClient host and port match your
gfServer settings use DBI; use strict; # DBI my ($dsn) = "DBI:mysql:schz_db:athena.bcgsc.ca"; my ($user_name) = "schz_rw"; my ($password) = "repeat"; my ($dbh, $sth); my (@ary);
####################### # Connect to
Database #
####################### $dbh =
DBI->connect ($dsn, $user_name, $password, $sth =
$dbh->prepare ("INSERT INTO unigene VALUES(NULL,?,?,?,?,?,?)"); my $blat_file = $ARGV[0]; print "file to blat is $blat_file\n\n"; my $total; unless ( open( die
"Cannot open file \"$blat_file\" to write to!\n\n"; } while (< chomp; if
(/^(\w+)$/) {
my $seq = $1;
$seq =~ s/\s+//g;
my $length = length($seq);
$total += $length; } } my $cutoff = 0.85*$total; print "$total\n"; print "cutoff is $cutoff\n\n"; close( print "BLAT query\n\n"; print `/home/perseusm/blat/gfClient
o0001 8050 / $blat_file $blat_file.out`; my $blat_hits = "$blat_file.out"; print
"open this file: $blat_hits\n\n"; close( unless (
open( die
"Cannot open file \"$blat_hits\" to write to!\n\n"; } while
(< if
(/^(\d+)\s+(\d+)\s+\d+\s+\d+\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+\S+\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)/)
{ print
" 1: $1 2: $2 3: $3 4: $4 5: $5 6: $6 7: $7 8: $8 9: $9 10: $10 11: $11 12: $12 13: $13 \n\n"; my
$match = $1; my
$mismatch = $2; my
$qGapCount = $3; my
$qGapBases = $4; my
$tGapCount = $5; my
$tGapBases = $6; my
$query = $7; my
$qSize = $8; my
$qStart = $9; my
$qEnd = $10;
my $chr = $11;
my $start = $12;
my $end = $13; my
$ug_cluster = $7; my
$ug_sequence = $ug_cluster;
$ug_cluster =~
s/^(Hs\.\S+)\|.*/\1/;
$ug_sequence =~
s/^.*\|(Hs\#.*)/\1/; my
$blatScore = $match - $mismatch - $qGapCount - $tGapCount; my
$identity = ($match)/($match + $mismatch + $qGapCount);
$chr =~ s/chr//g;
$identity = $identity * 100;
$identity = sprintf "%.1f",$identity; print "this
is the cluster: $ug_cluster\n"; print
"this is the sequence: $ug_sequence\n"; print
"maps to $chr\:$start\-$end in the human genome\n"; print
"identity: $identity score: $blatScore\n\n"; if
(($blatScore > $cutoff) && ($identity > 90)) {
print "it's a keeper\n"; print
"this is the cluster: $ug_cluster\n"; print
"this is the sequence: $ug_sequence\n"; print
"maps to $chr\:$start\-$end in the human genome\n"; print
"identity: $identity score: $blatScore\n\n";
# insert hit into database
$sth->execute ($ug_cluster,$chr,$start,$end,$blatScore,$identity);
} # system("rm $blat_file.out
-f"); } } exit; ^ top
|
|
|