### Analysis of written corpus
### Experiment 4 of
### Kentner, Gerrit and Isabelle Franz (2018). 
### No evidence for prosodic effects on the syntactic encoding of complement clauses in German 

# Load the written-corpus data set: a tab-separated text file read with
# Latin-1 encoding; the first row holds the column names.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
dassW <- read.csv("~/DataCode/DassWrittenCorpus.txt", sep = "\t",
                  header = TRUE, fileEncoding = "latin1")

# Per-column overview of the imported data
summary(dassW)


#### explanation of the headers 
#Date - date of text (day of newspaper)
#LContext - context to left of critical region
#MatrixSubord - critical region at clause boundary: embedding verb + top of subordinate clause (with or without "dass")
#RContext - context to right of critical region
#EmbedVerb - form of embedding verb
#TopSubord - Name at top of subordinate clause
#Freq - frequency rank of TopSubord within this sample
#EmbedVLemma - lemma of embedding verb
#valid - 1 if sentence is a valid complement clause
#dass - 1 if complement clause with "dass", 0 if cc without "dass"
#embedVStress - 1 if embedding verb ends in stressed syllable, -1 if embedding verb ends in unstressed syllable
#NameStress - 1 if first syll of TopSubord bears primary stress, 0 if secondary stress, -1 if first syll of TopSubord is unstressed; "?" if stress couldn't be determined


# Number of characters in the name that opens the subordinate clause
# (TopSubord is converted to character first, so this also works on a factor)
dassW$stringL <- nchar(as.character(dassW$TopSubord))

### Analysis subset: keep only rows whose name-initial stress was classified as
### main stress (1), secondary stress (0) or unstressed (-1); names with
### unclear stress (marked with "?") are excluded.
# `%in%` never returns NA, so rows with a missing NameStress are dropped.
# Chaining `==` with `|` yields NA for such rows, and an NA in a row index
# inserts an all-NA row into the result.
# NOTE(review): the `valid` column is not used as a filter here -- confirm that
# the input file already contains only valid complement clauses.
DassWvalid <- dassW[dassW$NameStress %in% c(-1, 0, 1), ]
dim(DassWvalid)

# Map the NameStress codes onto level numbers 1-3:
#   -1 (unstressed) -> 1, 0 (secondary stress) -> 2, everything else -> 3
# The column itself stays numeric; it is wrapped in as.factor() where a
# factor is needed.
DassWvalid$NameStressLevels <- ifelse(
  DassWvalid$NameStress == -1, 1,
  ifelse(DassWvalid$NameStress == 0, 2, 3)
)

# Frequency tables: overall use of "dass", then "dass" crossed with the stress
# of the embedding verb and with the stress level of the name
xtabs(~ dass, data = DassWvalid)
xtabs(~ dass + embedVStress, data = DassWvalid)
xtabs(~ dass + NameStressLevels, data = DassWvalid)

# Halve the 1 / -1 verb-stress code so that it becomes a .5 / -.5 sum contrast
DassWvalid$VerbStress <- 0.5 * DassWvalid$embedVStress

# brms provides set_prior() and brm(); it is loaded here so that the script
# also runs in a fresh R session.
library(brms)

# weakly informative priors: normal(0, 1) on the intercept, on the
# population-level coefficients (class "b") and on the group-level standard
# deviations (class "sd"); LKJ(2) on the group-level correlations (class "cor")
priorsW <- c(set_prior("normal(0, 1)", class = "Intercept"),
             set_prior("normal(0, 1)", class = "b"),
             set_prior("normal(0, 1)", class = "sd"),
             set_prior("lkj(2)", class = "cor"))


# Bayesian mixed-effects logistic regression (Bernoulli family) of "dass" on
# NameStress (as a 3-level factor), VerbStress, log frequency and log string
# length of the name, with by-lemma (EmbedVLemma) random intercepts and random
# slopes for NameStress and VerbStress.
# The priors are passed by name (`prior =`) instead of relying on positional
# matching after the named `data` and `family` arguments.
# NOTE(review): no seed is set, so the posterior draws differ between runs.
model1brm <- brm(
  dass ~ as.factor(NameStressLevels) + VerbStress + log(Freq) + log(stringL) +
    (as.factor(NameStressLevels) + VerbStress | EmbedVLemma),
  data = DassWvalid,
  family = bernoulli,
  prior = priorsW
)

summary(model1brm)


# Convert the fitted model to a data frame and summarise its columns
samples_model1brm <- as.data.frame(model1brm)
summary(samples_model1brm)

# For each population-level coefficient: pull its column out of the data
# frame, then print the proportion of its values that lie above zero.

# Intercept
posterior_Intercept <- samples_model1brm$b_Intercept
mean(posterior_Intercept > 0)

# Verb stress (.5 / -.5 coding)
posterior_VerbStress <- samples_model1brm$b_VerbStress
mean(posterior_VerbStress > 0)

# Name stress, level 2 of NameStressLevels
posterior_NameStress2 <- samples_model1brm$b_as.factorNameStressLevels2
mean(posterior_NameStress2 > 0)

# Name stress, level 3 of NameStressLevels
# (note that the variable name ends in 1 although it holds the level-3 column)
posterior_NameStress1 <- samples_model1brm$b_as.factorNameStressLevels3
mean(posterior_NameStress1 > 0)

# Log frequency of the name
posterior_logFreq <- samples_model1brm$b_logFreq
mean(posterior_logFreq > 0)

# Log string length of the name
posterior_logstringL <- samples_model1brm$b_logstringL
mean(posterior_logstringL > 0)
