---
title: "Discard Estimation"
output: html_notebook
---

```{r setup}
library(tidyverse)
library(forcats)
library(stringdist)
library(stringi)
```


The three data files are read in covering the FAO FishStat table, the fishery table and the link table.


```{r read_data}
Fishery <- read_csv("FisheryTable.csv", col_names=TRUE,
  col_types=cols(
    ID = col_integer(),
    Allocation = col_character(),
    Country = col_character(),
    Name = col_character(),
    `FAO Area` = col_character(),
    Ocean = col_character(),
    LME = col_character(),
    SA = col_character(),
    MA = col_character(),
    ManagementType = col_character(),
    Depth = col_character(),
    Location = col_character(),
    TargetSpecies = col_character(),
    Target = col_character(),
    VesselLength = col_character(),
    Gear = col_character(),
    Selectivity = col_character(),
    BRDUsed = col_character(),
    BRDType = col_character(),
    FishStat = col_character(),
    RefNotes = col_character()    )
  )
FishStat <- read_csv("FishStat.csv", col_names=TRUE,
  col_types=cols(
    `FishStat ID` = col_integer(),
    Allocation = col_character(),
    Country = col_character(),
    Spp_common = col_character(),
    Spp_scientific = col_character(),
    Spp_alpha = col_character(),
    Spp_tax = col_character(),
    `Fishing area name` = col_character(),
    `Fishing area code` = col_integer(),
    `Fishing area sub_code` = col_character(),
    Land_2010 = col_double(),
    Land_2011 = col_double(),
    Land_2012 = col_double(),
    Land_2013 = col_double(),
    Land_2014 = col_double(),
    `Land_avg_10-14` = col_double()   )
  )
FishStatLink <- read_csv("FishStatLink.csv", col_names=TRUE,
  col_types=cols(
    FisheryID = col_integer(),
    FishStatID = col_integer(),
    Percent = col_double()   )
  )
DiscardRates <- read_csv("DiscardRates.csv", col_names=TRUE,
  col_types=cols(
    ID = col_integer(),
    GearName = col_character(),
    GCode = col_character(),
    FAOArea = col_character(),
    Name = col_character(),
    LME = col_character(),
    FLAG = col_character(),
    Depth = col_character(),
    Year1 = col_integer(),
    Year2 = col_integer(),
    TargetSpecies = col_character(),
    TargetGroup = col_character(),
    VesselLength = col_character(),
    ManagementType = col_character(),
    Comment = col_character(),
    Applicability = col_character(),
    DiscardRate = col_double(),
    Disc1 = col_double(),
    Disc2 = col_double(),
    Type = col_character(),
    Robustness = col_character(),
    Source = col_character()
     )  )

```

The column names for the Fishery and FishStat are a little excentric, so we change them to simpler alternatives and drop calculated fields. The Fishtat table is also filtered to remove fisheries that are out of scope: inland fisheries. FishStatLink is not altered.

```{r  Select }
Fishery <- Fishery %>%
  mutate(Country=factor(Country)) %>%
  select(ID,
    alloc=Allocation,
    Country, Name,
    FAO=`FAO Area`, Ocean,
    LME, SA, MA, ManagementType,
    Depth, Location, 
    TargetSp=TargetSpecies, Target,
    VLength=VesselLength,
    Gear=Gear, Selectivity,
    BRDUse=BRDUsed, BRDType,
    FishStat
  )
FishStat <- FishStat %>% 
  mutate(Country=factor(Country)) %>%
  select(ID = `FishStat ID`,
    Allocation, Country,
    Spp_common,
    Spp_scientific,
    Spp_alpha,
    Spp_tax,
    AName=`Fishing area name`,
    ACode=`Fishing area code`,
    SAC=`Fishing area sub_code`,
    Land_2010, Land_2011,
    Land_2012, Land_2013,
    Land_2014  ) %>%
  filter(AName != "Europe - Inland waters", 
         AName != "Africa - Inland waters",
         AName != "America, North - Inland waters",
         AName != "America, South - Inland waters",
         AName != "Asia - Inland waters",
         AName != "Oceania - Inland waters") %>%
  gather(key="Year",value="Land",Land_2010,Land_2011,Land_2012,Land_2013,Land_2014)
  FishStat$Year <- factor(FishStat$Year)
  levels(FishStat$Year) <- c("2010","2011","2012","2013","2014")  #Convert to plain years
FishStat$Land[is.na(FishStat$Land)] <- 0   #NA's imply zero landings
```

Another is provided to ensure taxons which are out of scope (e.g. seaweed) are removed.

```{r RemoveTaxons}
ExcludeLandings <- read_csv("ExcludeLandings.csv", col_names=TRUE,
  col_types=cols(
    ID = col_integer(),
    Code = col_character(),
    Name = col_character(),
    Code3 = col_character() ))
#Remove relevant FishStat records
nrow(FishStat)
FishStat <- FishStat %>%
            anti_join(ExcludeLandings,by=c("Spp_tax"="Code"))
nrow(FishStat)
```


##Check For Duplicates  

In the following table, the number of rows in the fishery table should equal the number of distinct rows. If not, the table contains duplicates which need to be removed.

```{r CheckDuplicates}  
  #Check for duplicates
  print(c("Number of records for Current File, and With Duplicates Removed"))
  print(c(nrow(Fishery),
        nrow(distinct(select(Fishery, alloc,
             Country, Name, FAO, LME, SA, MA,
             Depth, Location, TargetSp, VLength,
             Gear, Selectivity, BRDUse, BRDType,
             FishStat)))))
```

##Check which percentages are not ok.

The following lists countries that have values less than 100% allocation of landings to fisheries, but greater than 0.

```{r checkpercent1}
tmp <- FishStatLink %>%
       group_by(FishStatID) %>%
       summarise(pc=sum(Percent)) %>%
       filter(pc < 100, pc > 0) %>%
       left_join(FishStat, by=c("FishStatID"="ID")) %>%
       mutate(Country=factor(Country), Spp_common=factor(Spp_common))
nrow(tmp)
levels(tmp$Country)
```

The following countries, if any, also need checking because they have catch allocated to fisheries of more than 100%.

```{r checkpercent2}
tmp <- FishStatLink %>%
       group_by(FishStatID) %>%
       summarise(pc=sum(Percent)) %>%
       filter(pc > 100) %>%
       left_join(FishStat,by=c("FishStatID"="ID")) %>%
       mutate(Country=factor(Country), Spp_common=factor(Spp_common))
nrow(tmp)
print("Country")
levels(tmp$Country)
print("Species")
levels(tmp$Spp_common)
rm(tmp)
```

The landings which are not allocated to a fishery is now calculated.The discards for these landings will not be included, so the value needs to be small.

```{r unallocated}
  al <- FishStat %>%
    inner_join(FishStatLink,by=c("ID"="FishStatID")) %>%
    group_by(Country) %>%
    summarise(Land=sum(Land, na.rm=TRUE))
  unal <- FishStat %>%
    anti_join(FishStatLink,by=c("ID"="FishStatID")) %>%
    group_by(Country) %>%
    summarise(uLand=sum(Land, na.rm=TRUE))
  alloc <- al %>% full_join(unal,by=c("Country"="Country"))
rm(al,unal)
nrow(alloc)
alloc[is.na(alloc)] <- 0
print("Proportion of all catches not allocated")
sum(alloc$uLand)/sum(alloc$uLand+alloc$Land)
```

Although overall unallocated landings are very low, for some countries unallocated catches are high:

```{r CountryAllocation}
alloc <- alloc %>%
  filter(uLand>0) %>%
  select(Country,AllocatedLand=Land,UnallocatedLand=uLand) %>%
  mutate(AllLand=AllocatedLand+UnallocatedLand,
         PropUnallocated=UnallocatedLand/(UnallocatedLand+AllocatedLand)) %>%
         arrange(desc(PropUnallocated),desc(UnallocatedLand))
alloc
rm(alloc)
```

Unallocated landings will need a default discard rate if they are included.

#Add Default Target Groups

For some fisheries, no target group has been provided. By default, targets (cephalopod, crustacea, finfish or shellfish) can be derived based on the taxonomy of the landings data. Note that based on this derivation, for example, shrimp trawls would be targetting finfish. Therefore this should only used to replace "NA".

```{r AddTargets}
Taxon <- read_csv("Taxon.csv", col_names=TRUE,
           cols(
             ISSCAAP = col_integer(),
             TAXOCODE = col_character(),
             `3A_CODE` = col_character(),
             Scientific_name = col_character(),
             English_name = col_character(),
             Family = col_character(),
             Order = col_character(),
             Target = col_character())   )  %>%
  select(TAXOCODE, Target)        
#Join to FishStat to get target
Targets <- FishStat %>%
  left_join(Taxon, by=c("Spp_tax"="TAXOCODE")) %>%
  left_join(FishStatLink,by=c("ID"="FishStatID")) %>%
  left_join(Fishery,by=c("FisheryID"="ID")) %>%
  mutate(Land=Land*Percent*0.01) %>%
  group_by(FisheryID, Target.y) %>%
  summarise(Land=sum(Land))
Targets <- Targets %>%
  group_by(FisheryID) %>%
  top_n(1,Land)

FisheryTargetMiss <- Fishery$ID[which(is.na(Fishery$Target))]
FisheryTargetReplaceNA <- FisheryTargetMiss %in% Targets$FisheryID[which(! is.na(Targets$Target.y))]
Fishery$ID[FisheryTargetMiss[FisheryTargetReplaceNA]]
rm(Targets, Taxon)
```

So no fishery remains which could have its target replaced using the FishStatLink information. 
  
  
#Link each Fishery to the 3 discard rate estimates

Firstly, the individual record estimates for each fishery are linked to their discard rate when there is a match.Note multiple discard estimates might exist for a fishery. In this case a mean and the maximum range are used.

```{r JoinDiscardLinkTable}
#Read in discard link table

DiscardLink <- read_csv("DiscardLink.csv", col_names=TRUE,
    col_types=cols(
          DID = col_integer(),
          GearName = col_character(),
          GCode = col_character(),
          FAO = col_character(),
          Name = col_character(),
          LME = col_character(),
          FLAG = col_character(),
          Dtarget = col_character(),
          FID = col_integer(),
          Similarity = col_double(),
          FName = col_character(),
          Country = col_character(),
          Gear = col_character(),
          FFAO = col_character(),
          FTarget = col_character()
    )  )


FishDisc1 <- DiscardLink %>%
  select(DID,FID,DName=Name,FName) %>%
  left_join(Fishery, by=c("FID"="ID")) %>%
  left_join(DiscardRates, by=c("DID"="ID")) %>%
  select(FID, DID, IE_DR=DiscardRate, 
         IE_DRlo=Disc1, IE_DRhi=Disc2, Robustness) %>%
  group_by(FID) %>%
  summarise(NRec=n(), DID=first(DID), IE_DR=mean(IE_DR), 
         IE_DRlo=min(IE_DRlo), IE_DRhi=max(IE_DRhi), Robustness=first(Robustness))
```  



```{r CountryDR}
CountryDR <- read_csv("CountrySpecificDR.csv", col_names=TRUE,
    col_types=cols(
          ID = col_integer(),
          FAOArea = col_character(),
          Name = col_character(),
          FLAG = col_character(),
          Year1 = col_integer(),
          Year2 = col_integer(),
          Comment = col_character(),
          Applicability = col_character(),
          DiscardRate = col_double(),
          Disc1 = col_double(),
          Disc2 = col_double(),
          Type = col_character(),
          Robustness = col_character(),
          Source = col_character()
    )  )
CountryDR$FLAG <- factor(CountryDR$FLAG,levels=levels(Fishery$Country))
FishDisc2 <- Fishery %>%
  left_join(CountryDR,by=c("Country"="FLAG")) %>%
  select(FID=ID.x, CO_DR=DiscardRate, CO_CIlo=Disc1, CO_CIhi=Disc2)

```



As an alternative, the gear averages can be used to estimate a discard rate. These are simple means and SE based on the observations and it is not recommended that they be used in practice because they are not based on any evaluation of the data. They are included here for comparison.

```{r GearAverage}
DiscGear <- read_csv("DiscardByGear.csv", col_names=TRUE,
    col_types=cols(
          Gear = col_character(),
          Code = col_character(),
          Mean_disc_rate = col_double(),
          Lower_95CI = col_double(),
          Upper_95CI = col_double()
    )  )

FishDisc3 <- Fishery %>%
  left_join(DiscGear,by=c("Gear"="Code")) %>%
  select(FID=ID, GE_DR=Mean_disc_rate, GE_CIlo=Lower_95CI, GE_CIhi=Upper_95CI)
```

The landings in tonnes are annual. An average is taken for further work below, but discards per year can be kept as annual so a time series of discards could be calculated by skipping the next code chunk (AverageLandings).

```{r AverageLandings}
FishStat <- FishStat %>%
  group_by(ID) %>%
  summarise(Allocation=first(Allocation), Country=first(Country),
    Spp_common=first(Spp_common), Spp_scientific=first(Spp_scientific),
    Spp_alpha=first(Spp_alpha),  Spp_tax=first(Spp_tax),
    AName=first(AName),      ACode=first(ACode),
    SAC=first(SAC),        Land=mean(Land))
```

A final table is generated joining all relevant tables to calculate the discard weight in each fishery. 

```{r CombineDiscards}
LandDisc <- FishStat %>%
  left_join(FishStatLink,by=c("ID"="FishStatID")) %>%
  left_join(Fishery,by=c("FisheryID"="ID")) %>%
  mutate(Proportion=Percent*0.01) %>%
  select(ID,Allocation,Country=Country.x,Spp_common,Spp_scientific,Spp_alpha,ACode,
         Land,FisheryID,Proportion,FisheryName=Name,Gear,Ocean,Gear,Target)    #Add Year field if AverageLandings not run above
LandDisc$FisheryID[is.na(LandDisc$FisheryID)] <- 0
LandDisc$Proportion[is.na(LandDisc$Proportion)] <- 1.0
LandDisc$FisheryName[is.na(LandDisc$FisheryName)] <- "None"

LandDisc <- LandDisc %>%
  left_join(FishDisc1, by=c("FisheryID"="FID")) %>%
  left_join(FishDisc2, by=c("FisheryID"="FID")) %>%
  left_join(FishDisc3, by=c("FisheryID"="FID"))
  
```

A preferred discard rate field with standard error is also created. This selects a discard rate in the preferred order of individual discard rate estimate (IE), the country specific estimate (CO) or a estimate based on gear type from the gear specific discard rate table (GE). Which is used is recorded in an accompanying field. The proportion of landings in each category is calculated.

For the SE and confidence intervals, the beta distribution is used, where alpha+beta=2 (sample size=1) and estimated discard rate = alpha/2.

```{r PreferredEstimate}
LandDisc$Landings <- LandDisc$Land*LandDisc$Proportion
LandDisc$DRSource <- "NA"
LandDisc$DiscRateMLE <- NA
LandDisc$DiscRate50 <- 0.0
LandDisc$Discards_95lo <- 0.0
LandDisc$Discards_95hi <- 0.0
LandDisc$DiscRate_SE <- 0.0
ESS <- 100  #Effective sample size
#Individual estimates
UseDR <- ! is.na(LandDisc$IE_DR)
LandDisc$DRSource[UseDR] <- "IE"  #Individual rate
LandDisc$DiscRateMLE[UseDR] <- LandDisc$IE_DR[UseDR]
#Country estimates
UseDR <- is.na(LandDisc$IE_DR) & ! is.na(LandDisc$CO_DR)
LandDisc$DRSource[UseDR] <- "CO"  #Country rate
LandDisc$DiscRateMLE[UseDR] <- LandDisc$CO_DR[UseDR]
#Both use the same calculation for the CI and SE
UseDR <- LandDisc$DRSource != "NA"   #! is.na(LandDisc$IE_DR) | ! is.na(LandDisc$CO_DR)
DRlo <- qbeta(0.025, ESS*LandDisc$DiscRateMLE[UseDR]+1, ESS*(1-LandDisc$DiscRateMLE[UseDR])+1)
LandDisc$Discards_95lo[UseDR] <- LandDisc$Landings[UseDR]*DRlo/(1-DRlo)
LandDisc$DiscRate50[UseDR] <- qbeta(0.5, ESS*LandDisc$DiscRateMLE[UseDR]+1, ESS*(1-LandDisc$DiscRateMLE[UseDR])+1)
DRhi <- qbeta(0.975, ESS*LandDisc$DiscRateMLE[UseDR]+1, ESS*(1-LandDisc$DiscRateMLE[UseDR])+1)
LandDisc$Discards_95hi[UseDR] <- LandDisc$Landings[UseDR]*DRhi/(1-DRhi)
LandDisc$DiscRate_SE[UseDR] <- (DRhi-DRlo)/(1.96*2)
#Gear based estimates
UseDR <- !UseDR  #Remaining records
LandDisc$DRSource[UseDR] <- "GE"  #Gear rate
LandDisc$DiscRateMLE[UseDR] <-LandDisc$GE_DR[UseDR]
LandDisc$Discards_95lo[UseDR] <- LandDisc$Landings[UseDR]*LandDisc$GE_CIlo[UseDR]/(1-LandDisc$GE_CIlo[UseDR])
LandDisc$DiscRate50[UseDR] <- LandDisc$GE_DR[UseDR]   #same as MLE
LandDisc$Discards_95hi[UseDR] <- LandDisc$Landings[UseDR]*LandDisc$GE_CIhi[UseDR]/(1-LandDisc$GE_CIhi[UseDR])
LandDisc$DiscRate_SE[UseDR] <- (LandDisc$GE_CIhi[UseDR]-LandDisc$GE_CIlo[UseDR])/(1.96*2)   #For these, use the gear SE instead
#MLE estimates
LandDisc$TotalCatchMLE <- LandDisc$Landings/(1-LandDisc$DiscRateMLE)
LandDisc$DiscardsMLE <- LandDisc$TotalCatchMLE*LandDisc$DiscRateMLE
#Median estimates
LandDisc$TotalCatch50 <- LandDisc$Landings/(1-LandDisc$DiscRate50)
LandDisc$Discards50 <- LandDisc$TotalCatch50*LandDisc$DiscRate50

#LandDisc$DiscRate_SE[UseDR] <- (LandDisc$IE_DRhi[UseDR]-LandDisc$IE_DRlo[UseDR])/(1.96*2)  #Assumes normal 95%CI
#UseDR <- UseDR & (LandDisc$DiscRate_SE==0)   #Identify cases with no range specified
#LandDisc$DiscRate_SE[UseDR] <- sqrt(LandDisc$DiscRate[UseDR]*(1-LandDisc$DiscRate[UseDR])*0.5)  #Assumes beta v=1
#LandDisc$DiscRate_SE[UseDR] <- sqrt(LandDisc$DiscRate[UseDR]*(1-LandDisc$DiscRate[UseDR])*0.5)  #Assumes beta v=1
#LandDisc$DiscRate_SE[UseDR] <- (LandDisc$CO_CIhi[UseDR]-LandDisc$CO_CIlo[UseDR])/(1.96*2)  #Assumes normal
#LandDisc$DiscRate_SE[UseDR] <- sqrt(LandDisc$DiscRate[UseDR]*(1-LandDisc$DiscRate[UseDR])*0.5)  #Assumes beta v=1
#LandDisc$DiscRate_SE[UseDR] <- (LandDisc$GE_CIhi[UseDR]-LandDisc$GE_CIlo[UseDR])/(1.96*2) #Assumes normal
#LandDisc$DiscRate_SE[UseDR] <- sqrt(LandDisc$DiscRate[UseDR]*(1-LandDisc$DiscRate[UseDR])*0.5)  #Assumes beta v=1
LandDisc$DRSource <- factor(LandDisc$DRSource)
         
#Finally remove DRSource=="NA"
LandDisc <- LandDisc %>%
  filter(DRSource!="NA",
         FisheryName!="None")

LandDisc <- LandDisc %>%
  group_by(FisheryID) %>%
  summarise(Country=first(Country),FisheryName=first(FisheryName),
    Gear=first(Gear), Ocean=first(Ocean),
    Target=first(Target),  NRec=first(NRec),
    DID=first(DID), IE_DR=first(IE_DR),
    IE_DRlo=first(IE_DRlo), IE_DRhi=first(IE_DRhi),
    CO_DR=first(CO_DR),	CO_CIlo=first(CO_CIlo),	CO_CIhi=first(CO_CIhi),
    GE_DR=first(GE_DR),	GE_CIlo=first(GE_CIlo),	GE_CIhi=first(GE_CIhi),
    Landings=sum(Landings),	DRSource=first(DRSource),
    DiscardsMLE=sum(DiscardsMLE),TotalCatchMLE=sum(TotalCatchMLE))


write.csv(LandDisc,"LandDisc.csv")

# ATTENTION!!! Note that this LandDisc file contains fisheries that has not been included in the Technical Report, i.e. fisheries whith gear=MIS + DRSource= GE. As a consequence, total landings, discards and catch do not match with total amounts in the Technical Report.

tmp <- LandDisc %>%
      group_by(DRSource) %>%
      summarise(Landings=sum(Landings)) %>%
      mutate(Proportion=Landings/sum(Landings))
tmp
```

```{r Check}

print(c("FishStat Landings" = sum(FishStat$Land), "LandDisc Landings" = sum(LandDisc$Landings)))

```

#Appendix: Additional Optional Code

##Groundtruthing
The following code creates a sample of records taken from each of the allocation groups to allow experts to check. Records are selected where landings are greater than 1 tonne.

```{r GroundTruthing, eval=FALSE}
samplesize <- 50
LandDisc$Allocation <- factor(LandDisc$Allocation)
for (alloc in levels(LandDisc$Allocation)){
  tmp <- LandDisc %>%
         filter(Allocation==alloc,Land>1.0) %>%
         sample_n(samplesize)
  write.csv(tmp,paste0("Test_",alloc,".csv"))
}
```


##Estimate SE for Groups
The following illustrates a technique to estimate standard errors when grouping by category. Because the log odds for the discard rates appear to be approximately normally distributed, we can use the standard deviation for the discard rates (primarily from the discard rate "norms" model) to estimate the standard deviation for the log odds estimate if it is provided. This in turn is the standard deviation parameter for the log-discards themselves, which are, as a direct result, log-normal. When calculating a sum of discards within a group of fisheries, we sum the log-normal variates. There is no closed form for this. It is suggested here to apply the Fenton-Wilkinson method, which is straightforward. It assumes that a sum of log-normal variates can be well approximated by a log-normal with the same mean and variance as the sum. Whether it is really a good approximation remains untested.

When adding log-normal estimates, a log-normal bias correction for the variance has also been added to obtain the expected discard in tonnes.This needs to be checked. In cases where the variance is estimated high, this can indicate a very much higher mean discard quantity compared to the median. Note that an sd/variance estimate rather than credibility intervals for these calculations.

```{r EstimatingError, eval=FALSE}
sum_LN <- function (mu,sigma2){
  #Obtains parameters for a new lognormal as a sum of lognormal values
  #using the Fenton-Wilkinson method. This generates parameters of a lognormal with the same
  #mean and variance as the summed lognormal values. It is an approximation and may not work well 
  #in all circumstances. Particularly if very low discard rates are applied, the normality assumption 
  #may not be met.
  nz <- ! is.infinite(mu)       #Removes zero discard
  mxv <- mean(mu[nz])           #Beware of overflow...
  mv <- mu[nz] - mxv
  denom <- sum(exp(mv+0.5*sigma2[nz]))
  numer <- sum(exp(2*mv + sigma2[nz])*(exp(sigma2[nz])-1))
  sig2 <- log(numer/(denom^2) + 1)
  return(c(log(denom)+mxv-0.5*sig2, sig2))
}

  LandDisc$Gear[is.na(LandDisc$Gear)] <- "MIS"
  LandDisc$Gear <- factor(LandDisc$Gear)
  GearSum <- tibble(Gear=factor(levels(LandDisc$Gear)), Discards=0, Discards_SE=0)
  for (gr in levels(LandDisc$Gear)) {
    ThisGear <- (gr==LandDisc$Gear)
    sd1 <- LandDisc$DiscRate[ThisGear]
    #For the following calculation, we set a minimum 0.01 to avoid excessively high sd's.
    # The delta-method is based on only a 1st order Taylor series expansion, so the correction is not unreasonable to avoid extreme values.
    sd1[sd1<0.01] <- 0.01 
    sd <- LandDisc$DiscRate_SE[ThisGear]/((1-sd1)*sd1)
    vr <- sd*sd
    mu <- log(LandDisc$Discards[ThisGear])-0.5*vr   #This assumes that the LandDisc$Discards[ThisGear] is the mean, otherwise a bias correction is not required
    tmp <- sum_LN(mu, vr)
    GearSum$Discards[GearSum$Gear==gr] <- exp(tmp[1]+0.5*tmp[2])
    GearSum$Discards_SE[GearSum$Gear==gr] <- sqrt(tmp[2])
  }
  GearSum
```


##Link Discard rates to landings

The following code creates a one-to-many join between the Fishery and DiscardRates table based on a simple similarity index to aid matching records. This has been useful to check for possible matches. However, subsequent manual editing of matches was undertaken in a spreadsheet, so this code is no longer useful.

```{r CreateDiscardLink, eval=FALSE}
ContainV <- function(TestV,InV){
  lv <- as.vector(stri_split_fixed(InV,",",simplify=TRUE))
  tmp <- matrix(0,nrow=length(TestV),ncol=length(lv))
  for (i in 1:length(lv)){
    tmp[,i] <- stri_detect_fixed(TestV,lv[i])
  }  
  return(rowSums(tmp))
}

#Name    Country Gear    FAOArea Depth   TargetSp    VesselLength    Year
wt <- c(1,1,5,1,1,1,1,1)
DiscardRates$Name[is.na(DiscardRates$Name)] <- ""
DiscardRates$FAOArea[is.na(DiscardRates$FAOArea)] <- ""
DiscardRates$GCode[is.na(DiscardRates$GCode)] <- ""
DiscardRates$Depth[is.na(DiscardRates$Depth)] <- ""
DiscardRates$TargetSpecies[is.na(DiscardRates$TargetSpecies)] <- ""
DiscardRates$VesselLength[is.na(DiscardRates$VesselLength)] <- ""
DiscardRates$Year2[is.na(DiscardRates$Year2)] <- 2005
Fishery$FAO[is.na(Fishery$FAO)] <- ""
Fishery$Depth[is.na(Fishery$Depth)] <- ""
Fishery$TargetSp[is.na(Fishery$TargetSp)] <- ""
Fishery$VLength[is.na(Fishery$VLength)] <- ""
Fishery$Gear[is.na(Fishery$Gear)] <- ""

DiscID <- integer(nrow(Fishery))
Similarity <- double(nrow(Fishery))
i <- 1
for (i in 1:nrow(Fishery)){
  simil <- stringdist(Fishery$Name[i],DiscardRates$Name,method="osa")
  mxs <- max(simil)
  simil <- wt[1]*(mxs-simil)/mxs
  simil <- simil+(Fishery$Country[i]==DiscardRates$FLAG)*wt[2]+
                 (Fishery$Gear[i]==DiscardRates$GCode)*wt[3]+
                 (Fishery$Depth[i]==DiscardRates$Depth)*wt[5]+
                 (Fishery$VLength[i]==DiscardRates$VesselLength)*wt[7]+
                 (DiscardRates$Year2 > 2010)*wt[8] 
  if (length(Fishery$FAO[i]) > 1){
    simil <- simil + ContainV(DiscardRates$FAOArea,Fishery$FAO[i])*wt[4]  }
  if (length(Fishery$TargetSp[i]) > 1){
    simil <- simil + ContainV(DiscardRates$TargetSpecies,Fishery$TargetSp[i])*wt[6] }
  DiscID[i] <- which.max(simil)
  Similarity[i] <- simil[DiscID[i]]
}
#Index link file joining fisheries to discard rates 
DiscardLinkx <- tibble(FID=c(0,Fishery$ID),DID=c(0,DiscID),Similarity=c(0,Similarity))

```