数据结构如下图。已知 Year 和 Month 列存在重复值,目的是按 Year 和 Month 分组,计算每组 Temperature 的平均值(Temperature 中存在 NA 值,求平均值时需要先去除 NA 再计算)。
perl语言版本如下
#!/usr/bin/perl -w
use strict;
use warnings;

# Group the rows of a tab-separated file by Year and Month and write the mean
# Temperature of every group to average.txt.
#
# Input : path given as the first command-line argument. The first line is
#         treated as a header and skipped. Year, Month and Temperature are
#         taken from the tab-separated fields at indices 10, 11 and 14.
# Output: average.txt, containing a "Year\tMonth\tTemperature" header followed
#         by one line per (Year, Month) pair present in the input, sorted
#         numerically by Year and then by Month.
#
# Missing temperatures (the string 'NA', or an absent/empty field) are left out
# of the mean. A group without any usable value is reported as NA instead of
# aborting with a division by zero.
my $usage=<<USAGE;
Usage:
perl $0 inputfile
USAGE
if(@ARGV==0){die $usage};
my $file=$ARGV[0];

# $values_for{$year}{$month} collects every Temperature field of that group.
# Grouping in memory during a single pass replaces the former temporary file
# (process_1.txt), which was re-read once per Year/Month combination and then
# removed with the Windows-only "del" command.
my %values_for;

open(my $in, '<', $file) or die "Cannot open $file: $!";
while (my $line = <$in>) {
    next if $. == 1;                # skip the header line
    $line =~ s/\r?\n\z//;           # like chomp, but also drops a stray CR
    # Limit -1 keeps trailing empty fields so column positions stay stable.
    my @fields = split /\t/, $line, -1;
    # A row without Year and Month columns cannot be assigned to any group.
    next if @fields < 12;
    push @{ $values_for{ $fields[10] }{ $fields[11] } }, $fields[14];
}
close($in);

open(my $out, '>', 'average.txt') or die "Cannot open average.txt: $!";
print {$out} "Year\tMonth\tTemperature\n";
for my $year (sort { $a <=> $b } keys %values_for) {
    for my $month (sort { $a <=> $b } keys %{ $values_for{$year} }) {
        # Filter missing values here so average() never sees an empty sample.
        my @usable = grep { defined($_) && $_ ne '' && $_ ne 'NA' }
                     @{ $values_for{$year}{$month} };
        my $mean = @usable ? average(@usable) : 'NA';
        print {$out} join("\t", $year, $month, $mean), "\n";
    }
}
# Buffered write errors only surface at close, so check it.
close($out) or die "Cannot close average.txt: $!";
# Return the arithmetic mean of the arguments, ignoring missing values.
#
# Arguments: a list of numbers; undefined entries and the string 'NA' are
#            skipped.
# Returns  : the mean of the remaining values, or the string 'NA' when nothing
#            remains (empty list or only missing values). Without that guard
#            the division below fails with "Illegal division by zero".
sub average{
    my @num=@_;
    my $count=0;
    my $total=0;
    for my $value (@num){
        next if !defined($value) || $value eq 'NA';
        $total += $value;
        $count++;
    }
    return 'NA' if $count == 0;
    return $total/$count;
}
R语言for循环版本如下
# Mean Temperature for every (Year, Month) pair that occurs in `data`,
# ignoring NA temperatures. The result `a` has the columns Year, Month and
# Temperature, in order of first appearance of each Year and Month.
# NOTE(review): assumes `data` was already read in and has the columns Year,
# Month and Temperature - confirm before running this snippet on its own.
dz_test <- data[,c("Year","Month","Temperature")]
rows <- list()
for (i in unique(dz_test$Year)){
  for (j in unique(dz_test$Month)){
    # which() discards NA comparisons, so rows with a missing Year or Month
    # are never selected.
    idx <- which(dz_test$Year==i & dz_test$Month==j)
    # Skip pairs that never occur together; their mean would only be NaN.
    if (length(idx)==0) next
    rows[[length(rows)+1]] <- data.frame(
      Year=i,
      Month=j,
      Temperature=mean(dz_test$Temperature[idx],na.rm=TRUE)
    )
  }
}
# Bind once at the end instead of growing the data frame on every iteration.
a <- if (length(rows)>0) do.call(rbind,rows) else dz_test[0,]
a
最简单也比较快的方法是使用R语言的tidyverse包
# Read Temperature.txt and compute the mean Temperature of every
# (Year, Month) group, ignoring NA values. The result is stored in `results1`
# with the columns Year, Month and Mean.
library(tidyverse)
# data.table = FALSE makes fread return a plain data.frame.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
data <- data.table::fread("Temperature.txt",data.table = FALSE)
results1 <- data %>%
  group_by(Year,Month) %>%
  summarise(Mean=mean(Temperature,na.rm=TRUE))
得到如下图的结果