############# # mapugc.pl # ############# #!/usr/bin/perl # mapugc.pl UniGene_Cluster.fa # use a shell for file in Hs.* loop to parse each individual UniGene fa file # blats each unique unigene sequence again human genome # if score is at least 85% of the theoretical max (i.e. all bases match) and 90% of input bases match somewhere, it qualifies as a hit # use this script to fish out chromosome co-ordinates for unigene clusters # make sure /home/unigene/ugc_unique/ has no *.out files in it before running this program # find -name "*.out" -print0 | xargs -0 ls # find -name "*.out" -print0 | xargs -0 rm –f # make sure BLAT gfClient host and port match your gfServer settings use DBI; use strict; # DBI my ($dsn) = "DBI:mysql:schz_db:athena.bcgsc.ca"; my ($user_name) = "schz_rw"; my ($password) = "repeat"; my ($dbh, $sth); my (@ary); ####################### # Connect to Database # ####################### $dbh = DBI->connect ($dsn, $user_name, $password, { RaiseError => 1 }); $sth = $dbh->prepare ("INSERT INTO unigene VALUES(NULL,?,?,?,?,?,?)"); my $blat_file = $ARGV[0]; print "file to blat is $blat_file\n\n"; my $total; unless ( open(FILE, "$blat_file") ) { die "Cannot open file \"$blat_file\" to write to!\n\n"; } while () { chomp; if (/^(\w+)$/) { my $seq = $1; $seq =~ s/\s+//g; my $length = length($seq); $total += $length; } } my $cutoff = 0.85*$total; print "$total\n"; print "cutoff is $cutoff\n\n"; close(FILE); print "BLAT query\n\n"; print `/home/perseusm/blat/gfClient o0001 8050 / $blat_file $blat_file.out`; my $blat_hits = "$blat_file.out"; print "open this file: $blat_hits\n\n"; close(HITS); unless ( open(HITS, "$blat_hits") ) { die "Cannot open file \"$blat_hits\" to write to!\n\n"; } while () { if (/^(\d+)\s+(\d+)\s+\d+\s+\d+\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+\S+\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)/) { print " 1: $1 2: $2 3: $3 4: $4 5: $5 6: $6 7: $7 8: $8 9: $9 10: $10 11: $11 12: $12 13: $13 \n\n"; my $match = $1; my $mismatch = $2; my $qGapCount = $3; my $qGapBases = $4; my $tGapCount = $5; my $tGapBases = $6; my $query = $7; my $qSize = $8; my $qStart = $9; my $qEnd = $10; my $chr = $11; my $start = $12; my $end = $13; my $ug_cluster = $7; my $ug_sequence = $ug_cluster; $ug_cluster =~ s/^(Hs\.\S+)\|.*/\1/; $ug_sequence =~ s/^.*\|(Hs\#.*)/\1/; my $blatScore = $match - $mismatch - $qGapCount - $tGapCount; my $identity = ($match)/($match + $mismatch + $qGapCount); $chr =~ s/chr//g; $identity = $identity * 100; $identity = sprintf "%.1f",$identity; print "this is the cluster: $ug_cluster\n"; print "this is the sequence: $ug_sequence\n"; print "maps to $chr\:$start\-$end in the human genome\n"; print "identity: $identity score: $blatScore\n\n"; if (($blatScore > $cutoff) && ($identity > 90)) { print "it's a keeper\n"; print "this is the cluster: $ug_cluster\n"; print "this is the sequence: $ug_sequence\n"; print "maps to $chr\:$start\-$end in the human genome\n"; print "identity: $identity score: $blatScore\n\n"; # insert hit into database $sth->execute ($ug_cluster,$chr,$start,$end,$blatScore,$identity); } # system("rm $blat_file.out -f"); } } exit;