library(lavaan)
library(lavaanPlot)
library(fastDummies)
library(ggcorrplot)
library(candisc)

Az adatok betöltése:

# Load the raw CSV export; the steps below filter and tidy it into the final
# analysis table.
data <- read.csv(
  file = "C:/Users/Dell/Downloads/20230403.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Drivetrain filter: keep only front-, rear- and all-wheel-drive rows.
# The column is addressed by its exact name "Hajtás." (the name that the
# "Hajtás._..." dummy columns later in the script are derived from). The former
# `data$Hajtás` only resolved through `$` partial matching, which silently
# returns NULL as soon as a second column starting with "Hajtás" appears.
hajtas_tipus <- c("Első kerék", "Hátsó kerék", "Összkerék")
data <- data[data[["Hajtás."]] %in% hajtas_tipus, ]

# Collapse the free-text air-conditioning description into three classes
# (digital, automatic, manual). Labels are written in reverse priority so that
# later assignments win: "Digitális" beats "Automata", which beats "Manuális".
# Rows matching none of the keywords stay NA and are removed afterwards.
data$Klima <- local({
  raw <- data$Klíma.fajtája
  label <- rep(NA_character_, length(raw))
  label[grepl("manuális", raw, ignore.case = TRUE)] <- "Manuális"
  label[grepl("automata", raw, ignore.case = TRUE)] <- "Automata"
  label[grepl("digitális", raw, ignore.case = TRUE)] <- "Digitális"
  label
})
data <- data[!is.na(data$Klima), ]

# Fuel filter: keep rows whose fuel text mentions petrol, diesel or electric.
data <- data[grepl("Benzin|Dízel|Elektromos", data$Üzemanyag., ignore.case = TRUE), ]

# Reduce the fuel text to one of three clean labels. Labels are written in
# reverse priority so that, when several keywords match, "Benzin" beats
# "Dízel", which beats "Elektromos".
data$uzemanyag <- local({
  raw <- data$Üzemanyag.
  label <- rep(NA_character_, length(raw))
  label[grepl("Elektromos", raw, ignore.case = TRUE)] <- "Elektromos"
  label[grepl("Dízel", raw, ignore.case = TRUE)] <- "Dízel"
  label[grepl("Benzin", raw, ignore.case = TRUE)] <- "Benzin"
  label
})

# Outlier filter on seating capacity: drop every row holding one of the three
# largest distinct values of the column.
# - The column is addressed by its exact name "Szállítható.szem..száma." (the
#   name the later rename to "szemelyszam" uses); the former spelling without
#   the trailing dot only worked through `$` partial matching.
# - head() replaces `[1:3]`, which padded the lookup set with NA when fewer
#   than three distinct values existed and thereby also removed NA rows.
szemszam_outlier <- head(
  sort(unique(data[["Szállítható.szem..száma."]]), decreasing = TRUE),
  3
)
data <- data[!data[["Szállítható.szem..száma."]] %in% szemszam_outlier, ]

# Keep only the (up to) 22 brands with the most rows.
# head() replaces `[1:22]`: positional indexing past the end of the table
# produced NA names when fewer than 22 brands were present, and those NA
# entries then let rows with a missing brand pass the %in% filter.
nepszeru_marka <- head(names(sort(table(data$márka), decreasing = TRUE)), 22)
data <- data[data$márka %in% nepszeru_marka, ]

# Classify the gearbox text as automatic or manual. "automata" is assigned last
# so it wins when both keywords occur; rows matching neither stay NA and are
# dropped.
data$sebvalto <- local({
  raw <- data$Sebességváltó
  label <- rep(NA_character_, length(raw))
  label[grepl("manuális", raw, ignore.case = TRUE)] <- "manualis"
  label[grepl("automata", raw, ignore.case = TRUE)] <- "automata"
  label
})
data <- data[!is.na(data$sebvalto), ]

# Create the age variable: 2023 minus the model year column (Évjárat.).
# NOTE(review): 2023 is presumably the year of the data snapshot (the input file
# is named 20230403.csv) — confirm before reusing with newer data.
data$kor <- 2023-data$Évjárat.

# Drop redundant columns by position, then remove rows with any missing value.
# NOTE(review): the indices depend on the column order of the input CSV plus the
# columns appended by the preceding steps — re-check them if either changes.
data <- data[, -c(1,4,6,7,10,11,13,15,19,20,22)]
data <- na.omit(data)

# Work with the natural logarithm of the purchase price; inspect its
# distribution with a histogram first.
hist(log(data$Vételár.))

data$price <- log(data$Vételár.)
data <- data[, -c(3,11)] # drop the original price and the brand column (per the original note; positions not verifiable from this script)

Sztenderdizálás és dummy-k

# Dummy-code the table with fastDummies: the source columns are removed and,
# for each encoded column, the dummy of its most frequent level is left out.
car <- dummy_cols(
  data,
  remove_most_frequent_dummy = TRUE,
  remove_selected_columns = TRUE
)

# Standardize every column (dummies included) to mean 0 and unit variance.
# data.frame() with check.names = TRUE also sanitizes the column names (e.g.
# spaces become dots), which the renames that follow rely on.
car_st <- data.frame(scale(car, center = TRUE, scale = TRUE), check.names = TRUE)

# Replace the long, accented column names with short ASCII names that are
# convenient to use in lavaan model syntax. Columns not listed in the lookup
# table (or absent from the data) are left untouched.
colnames(car_st) <- local({
  short_names <- c(
    "Hengerűrtartalom." = "henger",
    "Teljesítmény." = "teljesitmeny",
    "Teljes.tömeg." = "tomeg",
    "Km..óra.állás." = "ut",
    "Szállítható.szem..száma." = "szemelyszam",
    "Csomagtartó." = "csomagtarto",
    "uzemanyag_Elektromos" = "elektromos",
    "uzemanyag_Dízel" = "dizel",
    "sebvalto_automata" = "automatasebvalto",
    "Állapot._Kitűnő" = "kituno",
    "Állapot._Sérülésmentes" = "serulesmentes",
    "Állapot._Újszerű" = "ujszeru",
    "Állapot._Megkímélt" = "megkimelt",
    "Hajtás._Összkerék" = "osszkerek",
    "Hajtás._Hátsó.kerék" = "hatsokerek",
    "Klima_Automata" = "automataklima",
    "Klima_Manuális" = "manualisklima"
  )
  current <- colnames(car_st)
  known <- current %in% names(short_names)
  current[known] <- unname(short_names[current[known]])
  current
})

# Correlation heat map of columns 1-8 and 25-31 of the standardized table
# (columns are selected by position).
ggcorrplot(corr = cor(x = car_st[, c(seq_len(8), seq(25, 31))]))

Modellépítés

  1. kezdeti modell
# Model 1 (baseline): log price regressed on power, mileage, age and weight.
# This single regression is just-identified (0 degrees of freedom), so its
# perfect fit indices carry no information about model quality.
mod1 <- "price~teljesitmeny+ut+kor+tomeg"
sem1 <- sem(mod1, data = car_st, estimator = "MLF")
# `fit.measures` is spelled out in full; the former `fit=TRUE` depended on
# partial argument matching.
summary(sem1, fit.measures = TRUE) # CFI: 1.000; TLI: 1.000; RMSEA: 0.000
## lavaan 0.6-19 ended normally after 1 iteration
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                         5
## 
##   Number of observations                         52698
## 
## Model Test User Model:
##                                                       
##   Test statistic                                 0.000
##   Degrees of freedom                                 0
## 
## Model Test Baseline Model:
## 
##   Test statistic                            107731.217
##   Degrees of freedom                                 4
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    1.000
##   Tucker-Lewis Index (TLI)                       1.000
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)             -20909.114
##   Loglikelihood unrestricted model (H1)     -20909.114
##                                                       
##   Akaike (AIC)                               41828.229
##   Bayesian (BIC)                             41872.591
##   Sample-size adjusted Bayesian (SABIC)      41856.700
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.000
##   90 Percent confidence interval - lower         0.000
##   90 Percent confidence interval - upper         0.000
##   P-value H_0: RMSEA <= 0.050                       NA
##   P-value H_0: RMSEA >= 0.080                       NA
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.000
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                              First.order
##   Information saturated (h1) model          Structured
## 
## Regressions:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   price ~                                             
##     teljesitmeny      0.389    0.001  365.919    0.000
##     ut               -0.080    0.000 -824.010    0.000
##     kor              -0.689    0.001 -605.822    0.000
##     tomeg             0.066    0.000  622.137    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .price             0.129    0.000 1228.093    0.000
# Path diagram of model 1 with coefficient labels, using a 0.05 significance
# threshold. `coefs` is the documented argument name; the former `coef` only
# reached it through partial argument matching.
lavaanPlot(sem1, coefs = TRUE, sig = 0.05)
  2. modell
# Model 2: adds the automatic-gearbox and diesel dummies as price predictors,
# plus three structural paths: mileage on age, automatic gearbox on age and
# diesel on weight.
mod2 <- "price~teljesitmeny+ut+kor+tomeg+automatasebvalto+dizel
         ut~kor
         automatasebvalto~kor
         dizel~tomeg"

sem2 <- sem(mod2, data = car_st, estimator = "MLF")
# `fit.measures` is spelled out in full; the former `fit=TRUE` depended on
# partial argument matching.
summary(sem2, fit.measures = TRUE) # CFI: 0.843; TLI: 0.686; RMSEA: 0.234
## lavaan 0.6-19 ended normally after 1 iteration
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        13
## 
##   Number of observations                         52698
## 
## Model Test User Model:
##                                                        
##   Test statistic                              25889.827
##   Degrees of freedom                                  9
##   P-value (Chi-square)                            0.000
## 
## Model Test Baseline Model:
## 
##   Test statistic                            164893.359
##   Degrees of freedom                                18
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.843
##   Tucker-Lewis Index (TLI)                       0.686
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)            -229597.125
##   Loglikelihood unrestricted model (H1)    -216652.212
##                                                       
##   Akaike (AIC)                              459220.250
##   Bayesian (BIC)                            459335.590
##   Sample-size adjusted Bayesian (SABIC)     459294.276
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.234
##   90 Percent confidence interval - lower         0.231
##   90 Percent confidence interval - upper         0.236
##   P-value H_0: RMSEA <= 0.050                    0.000
##   P-value H_0: RMSEA >= 0.080                    1.000
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.137
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                              First.order
##   Information saturated (h1) model          Structured
## 
## Regressions:
##                      Estimate  Std.Err  z-value  P(>|z|)
##   price ~                                               
##     teljesitmeny        0.331    0.002  207.029    0.000
##     ut                 -0.089    0.002  -51.502    0.000
##     kor                -0.664    0.001 -524.020    0.000
##     tomeg               0.046    0.003   15.761    0.000
##     automatasebvlt      0.119    0.002   59.579    0.000
##     dizel               0.022    0.002   11.786    0.000
##   ut ~                                                  
##     kor                 0.522    0.005  106.386    0.000
##   automatasebvalto ~                                    
##     kor                -0.311    0.005  -64.490    0.000
##   dizel ~                                               
##     tomeg               0.304    0.006   54.766    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .price             0.120    0.000 1265.745    0.000
##    .ut                0.728    0.001  517.367    0.000
##    .automatasebvlt    0.903    0.010   89.803    0.000
##    .dizel             0.908    0.023   39.682    0.000
# Path diagram of model 2 with coefficient labels, using a 0.05 significance
# threshold. `coefs` is the documented argument name; the former `coef` only
# reached it through partial argument matching.
lavaanPlot(sem2, coefs = TRUE, sig = 0.05)

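Bár mindegyik kapcsolat szignifikáns, az illeszkedési mutatók (CFI, TLI, RMSEA) alapján romlott a modell illeszkedése. A kezdeti modell azonban telített volt (0 szabadságfok), így annak tökéletes illeszkedése nem informatív.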

  3. modell
# Model 3: model 2 extended with the four condition dummies (kituno, megkimelt,
# serulesmentes, ujszeru) as additional price predictors.
mod3 <- "price~teljesitmeny+ut+kor+tomeg+automatasebvalto+dizel+
kituno+megkimelt+serulesmentes+ujszeru
         ut~kor
         automatasebvalto~kor
         dizel~tomeg
         "
sem3 <- sem(mod3, data = car_st, estimator = "MLF")
# `fit.measures` is spelled out in full; the former `fit=TRUE` depended on
# partial argument matching.
summary(sem3, fit.measures = TRUE) # CFI: 0.838; TLI: 0.738; RMSEA: 0.157
## lavaan 0.6-19 ended normally after 2 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        17
## 
##   Number of observations                         52698
## 
## Model Test User Model:
##                                                        
##   Test statistic                              27267.907
##   Degrees of freedom                                 21
##   P-value (Chi-square)                            0.000
## 
## Model Test Baseline Model:
## 
##   Test statistic                            168472.323
##   Degrees of freedom                                34
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.838
##   Tucker-Lewis Index (TLI)                       0.738
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)            -228496.683
##   Loglikelihood unrestricted model (H1)    -214862.730
##                                                       
##   Akaike (AIC)                              457027.366
##   Bayesian (BIC)                            457178.196
##   Sample-size adjusted Bayesian (SABIC)     457124.170
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.157
##   90 Percent confidence interval - lower         0.155
##   90 Percent confidence interval - upper         0.158
##   P-value H_0: RMSEA <= 0.050                    0.000
##   P-value H_0: RMSEA >= 0.080                    1.000
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.091
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                              First.order
##   Information saturated (h1) model          Structured
## 
## Regressions:
##                      Estimate  Std.Err  z-value  P(>|z|)
##   price ~                                               
##     teljesitmeny        0.325    0.002  203.947    0.000
##     ut                 -0.080    0.002  -43.284    0.000
##     kor                -0.642    0.001 -496.102    0.000
##     tomeg               0.045    0.003   15.767    0.000
##     automatasebvlt      0.115    0.002   58.958    0.000
##     dizel               0.024    0.002   13.126    0.000
##     kituno              0.074    0.002   40.102    0.000
##     megkimelt           0.048    0.002   27.128    0.000
##     serulesmentes       0.035    0.002   18.578    0.000
##     ujszeru             0.060    0.002   32.216    0.000
##   ut ~                                                  
##     kor                 0.522    0.005  105.816    0.000
##   automatasebvalto ~                                    
##     kor                -0.311    0.005  -64.410    0.000
##   dizel ~                                               
##     tomeg               0.304    0.006   53.674    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .price             0.115    0.000 1201.842    0.000
##    .ut                0.728    0.001  524.397    0.000
##    .automatasebvlt    0.903    0.010   89.675    0.000
##    .dizel             0.908    0.023   39.226    0.000
# Path diagram of model 3 with coefficient labels, using a 0.05 significance
# threshold. `coefs` is the documented argument name; the former `coef` only
# reached it through partial argument matching.
lavaanPlot(sem3, coefs = TRUE, sig = 0.05)

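Az RMSEA (0,234 → 0,157) és a TLI (0,686 → 0,738) javult az előző modellhez képest, a CFI viszont kissé romlott (0,843 → 0,838); az illeszkedés sajnos így sem elfogadható.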

Látens változó a modellben → az autó luxusjellege (luxus)

# Model 4: introduces the latent variable `luxus`, measured by power, automatic
# gearbox, seat count and engine displacement, and adds it to the price
# equation. std.lv = TRUE identifies the latent by fixing its variance to 1.
mod4 <- "luxus=~teljesitmeny+automatasebvalto+szemelyszam+henger
       ut~kor
       automatasebvalto~kor
       dizel~tomeg
       price~teljesitmeny+ut+kor+tomeg+automatasebvalto+dizel+luxus
       "
sem4 <- sem(mod4, data = car_st, estimator = "MLF", std.lv = TRUE)
# `fit.measures` is spelled out in full; the former `fit=TRUE` depended on
# partial argument matching.
summary(sem4, fit.measures = TRUE) # CFI: 0.819; TLI: 0.699; RMSEA: 0.212
## lavaan 0.6-19 ended normally after 45 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        21
## 
##   Number of observations                         52698
## 
## Model Test User Model:
##                                                        
##   Test statistic                              49904.084
##   Degrees of freedom                                 21
##   P-value (Chi-square)                            0.000
## 
## Model Test Baseline Model:
## 
##   Test statistic                            276133.184
##   Degrees of freedom                                35
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.819
##   Tucker-Lewis Index (TLI)                       0.699
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)            -410308.510
##   Loglikelihood unrestricted model (H1)    -385356.468
##                                                       
##   Akaike (AIC)                              820659.019
##   Bayesian (BIC)                            820845.338
##   Sample-size adjusted Bayesian (SABIC)     820778.600
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.212
##   90 Percent confidence interval - lower         0.211
##   90 Percent confidence interval - upper         0.214
##   P-value H_0: RMSEA <= 0.050                    0.000
##   P-value H_0: RMSEA >= 0.080                    1.000
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.152
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                              First.order
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   luxus =~                                            
##     teljesitmeny      0.907    0.004  217.281    0.000
##     automatasebvlt    0.570    0.007   87.588    0.000
##     szemelyszam      -0.040    0.004   -9.896    0.000
##     henger            0.930    0.003  285.723    0.000
## 
## Regressions:
##                      Estimate  Std.Err  z-value  P(>|z|)
##   ut ~                                                  
##     kor                 0.522    0.005  105.445    0.000
##   automatasebvalto ~                                    
##     kor                -0.264    0.004  -66.019    0.000
##   dizel ~                                               
##     tomeg               0.304    0.005   56.526    0.000
##   price ~                                               
##     teljesitmeny        0.183    0.007   27.391    0.000
##     ut                 -0.091    0.002  -53.013    0.000
##     kor                -0.694    0.002 -407.348    0.000
##     tomeg               0.042    0.003   13.421    0.000
##     automatasebvlt      0.098    0.002   45.333    0.000
##     dizel               0.005    0.002    2.755    0.006
##     luxus               0.173    0.007   24.158    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .teljesitmeny      0.178    0.003   53.909    0.000
##    .automatasebvlt    0.580    0.005  123.478    0.000
##    .szemelyszam       0.998    0.003  390.038    0.000
##    .henger            0.134    0.003   39.293    0.000
##    .ut                0.728    0.001  509.435    0.000
##    .dizel             0.908    0.023   39.652    0.000
##    .price             0.116    0.000  471.333    0.000
##     luxus             1.000
# Path diagram of model 4 with coefficient labels, using a 0.05 significance
# threshold. `coefs` is the documented argument name; the former `coef` only
# reached it through partial argument matching.
lavaanPlot(sem4, coefs = TRUE, sig = 0.05)

Nem igazán javult tőle (egyre rosszabb).

Másik látens változó bevezetése

# Model 5: replaces `luxus` with the latent variable `nagy`, measured by engine
# displacement, boot capacity, seat count and weight, and adds it to the price
# equation. std.lv = TRUE identifies the latent by fixing its variance to 1.
mod5 <- "nagy=~henger+csomagtarto+szemelyszam+tomeg
       ut~kor
       automatasebvalto~kor
       dizel~tomeg
       price~teljesitmeny+ut+kor+tomeg+automatasebvalto+dizel+nagy
       "
sem5 <- sem(mod5, data = car_st, estimator = "MLF", std.lv = TRUE)
# `fit.measures` is spelled out in full; the former `fit=TRUE` depended on
# partial argument matching.
summary(sem5, fit.measures = TRUE) # CFI: 0.561; TLI: 0.356; RMSEA: 0.283
## lavaan 0.6-19 ended normally after 29 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        22
## 
##   Number of observations                         52698
## 
## Model Test User Model:
##                                                         
##   Test statistic                              126249.160
##   Degrees of freedom                                  30
##   P-value (Chi-square)                             0.000
## 
## Model Test Baseline Model:
## 
##   Test statistic                            287401.590
##   Degrees of freedom                                44
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.561
##   Tucker-Lewis Index (TLI)                       0.356
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)            -517621.568
##   Loglikelihood unrestricted model (H1)    -454496.988
##                                                       
##   Akaike (AIC)                             1035287.136
##   Bayesian (BIC)                           1035482.327
##   Sample-size adjusted Bayesian (SABIC)    1035412.411
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.283
##   90 Percent confidence interval - lower         0.281
##   90 Percent confidence interval - upper         0.284
##   P-value H_0: RMSEA <= 0.050                    0.000
##   P-value H_0: RMSEA >= 0.080                    1.000
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.208
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                              First.order
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   nagy =~                                             
##     henger            0.616    0.007   86.683    0.000
##     csomagtarto       0.457    0.006   75.983    0.000
##     szemelyszam       0.144    0.006   25.143    0.000
##     tomeg             0.762    0.009   87.809    0.000
## 
## Regressions:
##                      Estimate  Std.Err  z-value  P(>|z|)
##   ut ~                                                  
##     kor                 0.522    0.005  106.198    0.000
##   automatasebvalto ~                                    
##     kor                -0.311    0.005  -63.008    0.000
##   dizel ~                                               
##     tomeg               0.304    0.019   16.378    0.000
##   price ~                                               
##     teljesitmeny        0.293    0.002  123.038    0.000
##     ut                 -0.093    0.002  -53.749    0.000
##     kor                -0.671    0.001 -477.848    0.000
##     tomeg              -0.040    0.008   -5.123    0.000
##     automatasebvlt      0.114    0.002   56.968    0.000
##     dizel               0.008    0.002    4.426    0.000
##     nagy                0.142    0.008   17.887    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .henger            0.620    0.006   99.947    0.000
##    .csomagtarto       0.791    0.006  143.326    0.000
##    .szemelyszam       0.979    0.003  282.244    0.000
##    .tomeg             0.419    0.010   41.057    0.000
##    .ut                0.728    0.001  498.103    0.000
##    .automatasebvlt    0.903    0.011   83.908    0.000
##    .dizel             0.908    0.051   17.759    0.000
##    .price             0.113    0.001  167.496    0.000
##     nagy              1.000
# Path diagram of model 5 with coefficient labels, using a 0.05 significance
# threshold. `coefs` is the documented argument name; the former `coef` only
# reached it through partial argument matching.
lavaanPlot(sem5, coefs = TRUE, sig = 0.05)

Erről ne is beszéljünk…

# Model 6: latent variable `regi`, measured by manual air conditioning,
# automatic gearbox, age and mileage, added to the price equation.
# NOTE(review): `kor` and `ut` are indicators of `regi` and also appear in the
# regressions (ut~kor, automatasebvalto~kor, price~...) — confirm this double
# role is intended.
mod6 <- " ut~kor
        regi=~manualisklima+automatasebvalto+kor+ut
        automatasebvalto~kor
        dizel~tomeg
        price~teljesitmeny+ut+kor+tomeg+automatasebvalto+dizel+regi
       "
sem6 <- sem(mod6, data = car_st, estimator = "MLF", std.lv = TRUE)
# `fit.measures` is spelled out in full; the former `fit=TRUE` depended on
# partial argument matching.
summary(sem6, fit.measures = TRUE) # CFI: 0.813; TLI: 0.612; RMSEA: 0.225
## lavaan 0.6-19 ended normally after 46 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        20
## 
##   Number of observations                         52698
## 
## Model Test User Model:
##                                                        
##   Test statistic                              34663.976
##   Degrees of freedom                                 13
##   P-value (Chi-square)                            0.000
## 
## Model Test Baseline Model:
## 
##   Test statistic                            185362.648
##   Degrees of freedom                                27
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.813
##   Tucker-Lewis Index (TLI)                       0.612
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)            -373299.001
##   Loglikelihood unrestricted model (H1)    -355967.013
##                                                       
##   Akaike (AIC)                              746638.002
##   Bayesian (BIC)                            746815.448
##   Sample-size adjusted Bayesian (SABIC)     746751.888
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.225
##   90 Percent confidence interval - lower         0.223
##   90 Percent confidence interval - upper         0.227
##   P-value H_0: RMSEA <= 0.050                    0.000
##   P-value H_0: RMSEA >= 0.080                    1.000
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.182
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                              First.order
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   regi =~                                             
##     manualisklima     0.810    0.031   26.515    0.000
##     automatasebvlt   -0.397    0.019  -20.722    0.000
##     kor               0.331    0.014   23.736    0.000
##     ut               -0.157    0.009  -16.992    0.000
## 
## Regressions:
##                      Estimate  Std.Err  z-value  P(>|z|)
##   ut ~                                                  
##     kor                 0.574    0.007   88.019    0.000
##   automatasebvalto ~                                    
##     kor                -0.180    0.012  -15.201    0.000
##   dizel ~                                               
##     tomeg               0.304    0.005   61.213    0.000
##   price ~                                               
##     teljesitmeny        0.316    0.002  193.147    0.000
##     ut                 -0.099    0.002  -45.069    0.000
##     kor                -0.642    0.003 -210.268    0.000
##     tomeg               0.041    0.003   14.240    0.000
##     automatasebvlt      0.097    0.004   26.781    0.000
##     dizel               0.012    0.002    6.652    0.000
##     regi               -0.087    0.006  -14.484    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .manualisklima     0.344    0.049    6.986    0.000
##    .automatasebvlt    0.762    0.016   48.502    0.000
##    .kor               0.891    0.010   84.836    0.000
##    .ut                0.706    0.002  330.303    0.000
##    .dizel             0.908    0.023   39.624    0.000
##    .price             0.115    0.001  197.961    0.000
##     regi              1.000
# Path diagram of model 6 with coefficient labels, using a 0.05 significance
# threshold. `coefs` is the documented argument name; the former `coef` only
# reached it through partial argument matching.
lavaanPlot(sem6, coefs = TRUE, sig = 0.05)

Még mindig nem megfelelő az illeszkedés.

  7. modell
# Model 7: latent variable `nagy`, measured by weight, engine displacement,
# power and the diesel dummy.
# NOTE(review): `nagy` is defined but does not enter the price equation in this
# specification — confirm that omission is intended.
mod7 <- "nagy=~tomeg+henger+teljesitmeny+dizel
       ut~kor
       automatasebvalto~kor
       dizel~tomeg
       price~teljesitmeny+ut+kor+tomeg+automatasebvalto
       "
sem7 <- sem(mod7, data = car_st, estimator = "MLF", std.lv = TRUE)
# `fit.measures` is spelled out in full; the former `fit=TRUE` depended on
# partial argument matching.
summary(sem7, fit.measures = TRUE) # CFI: 0.803; TLI: 0.655; RMSEA: 0.253
## lavaan 0.6-19 ended normally after 29 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        19
## 
##   Number of observations                         52698
## 
## Model Test User Model:
##                                                        
##   Test statistic                              53807.994
##   Degrees of freedom                                 16
##   P-value (Chi-square)                            0.000
## 
## Model Test Baseline Model:
## 
##   Test statistic                            273204.321
##   Degrees of freedom                                28
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.803
##   Tucker-Lewis Index (TLI)                       0.655
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)            -413724.896
##   Loglikelihood unrestricted model (H1)    -386820.899
##                                                       
##   Akaike (AIC)                              827487.793
##   Bayesian (BIC)                            827656.367
##   Sample-size adjusted Bayesian (SABIC)     827595.985
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.253
##   90 Percent confidence interval - lower         0.251
##   90 Percent confidence interval - upper         0.254
##   P-value H_0: RMSEA <= 0.050                    0.000
##   P-value H_0: RMSEA >= 0.080                    1.000
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.179
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                              First.order
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   nagy =~                                             
##     tomeg             0.475    0.007   72.234    0.000
##     henger            0.985    0.005  191.145    0.000
##     teljesitmeny      0.857    0.005  160.952    0.000
##     dizel             0.210    0.014   15.376    0.000
## 
## Regressions:
##                      Estimate  Std.Err  z-value  P(>|z|)
##   ut ~                                                  
##     kor                 0.522    0.005  105.286    0.000
##   automatasebvalto ~                                    
##     kor                -0.311    0.005  -60.641    0.000
##   dizel ~                                               
##     tomeg               0.204    0.013   15.515    0.000
##   price ~                                               
##     teljesitmeny        0.330    0.002  166.085    0.000
##     ut                 -0.082    0.002  -47.793    0.000
##     kor                -0.667    0.001 -520.362    0.000
##     tomeg               0.052    0.003   18.454    0.000
##     automatasebvlt      0.121    0.002   60.319    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .tomeg             0.774    0.003  280.956    0.000
##    .henger            0.030    0.006    5.055    0.000
##    .teljesitmeny      0.266    0.005   58.653    0.000
##    .dizel             0.873    0.048   18.341    0.000
##    .ut                0.728    0.001  561.367    0.000
##    .automatasebvlt    0.903    0.011   82.820    0.000
##    .price             0.120    0.000 1232.742    0.000
##     nagy              1.000
# Path diagram of model 7 with coefficient labels, using a 0.05 significance
# threshold. `coefs` is the documented argument name; the former `coef` only
# reached it through partial argument matching.
lavaanPlot(sem7, coefs = TRUE, sig = 0.05)
# Correlation between the standardized engine displacement and weight columns.
cor(car_st$henger, car_st$tomeg)
## [1] 0.4627088
  8. modell
# Model 8: the `regi` latent variable from model 6, without the
# automatasebvalto~kor path and without the diesel dummy in the price equation.
mod8 <- " ut~kor
        regi=~manualisklima+automatasebvalto+kor+ut
        dizel~tomeg
        price~teljesitmeny+ut+kor+tomeg+automatasebvalto+regi
       "
sem8 <- sem(mod8, data = car_st, estimator = "MLF", std.lv = TRUE)
# `fit.measures` is spelled out in full; the former `fit=TRUE` depended on
# partial argument matching.
summary(sem8, fit.measures = TRUE) # CFI: 0.812; TLI: 0.637; RMSEA: 0.218
## lavaan 0.6-19 ended normally after 29 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        19
## 
##   Number of observations                         52698
## 
## Model Test User Model:
##                                                        
##   Test statistic                              34917.790
##   Degrees of freedom                                 14
##   P-value (Chi-square)                            0.000
## 
## Model Test Baseline Model:
## 
##   Test statistic                            185362.648
##   Degrees of freedom                                27
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.812
##   Tucker-Lewis Index (TLI)                       0.637
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)            -373425.908
##   Loglikelihood unrestricted model (H1)    -355967.013
##                                                       
##   Akaike (AIC)                              746889.816
##   Bayesian (BIC)                            747058.390
##   Sample-size adjusted Bayesian (SABIC)     746998.008
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.218
##   90 Percent confidence interval - lower         0.216
##   90 Percent confidence interval - upper         0.219
##   P-value H_0: RMSEA <= 0.050                    0.000
##   P-value H_0: RMSEA >= 0.080                    1.000
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.183
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                              First.order
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   regi =~                                             
##     manualisklima     0.589    0.007   79.644    0.000
##     automatasebvlt   -0.629    0.009  -73.394    0.000
##     kor               0.477    0.007   73.037    0.000
##     ut               -0.174    0.009  -19.861    0.000
## 
## Regressions:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   ut ~                                                
##     kor               0.605    0.006   97.856    0.000
##   dizel ~                                             
##     tomeg             0.304    0.005   61.220    0.000
##   price ~                                             
##     teljesitmeny      0.316    0.002  193.187    0.000
##     ut               -0.110    0.002  -54.115    0.000
##     kor              -0.599    0.003 -200.946    0.000
##     tomeg             0.044    0.003   16.868    0.000
##     automatasebvlt    0.039    0.004   10.703    0.000
##     regi             -0.175    0.006  -29.080    0.000
## 
## Covariances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##  .dizel ~~                                            
##    .price             0.011    0.002    6.542    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .manualisklima     0.654    0.010   63.542    0.000
##    .automatasebvlt    0.605    0.010   61.808    0.000
##    .kor               0.772    0.007  115.294    0.000
##    .ut                0.704    0.002  361.735    0.000
##    .dizel             0.908    0.023   39.626    0.000
##    .price             0.105    0.001  122.480    0.000
##     regi              1.000
# Path diagram of sem8. `coefs` is the documented argument name; the former
# `coef` only worked through R's partial argument matching.
lavaanPlot(model = sem8, coefs = TRUE, sig = 0.05)

Modifikációs index

# Ten largest modification indices of sem8, listed from the highest mi down.
modindices(object = sem8, sort = TRUE, maximum.number = 10)
##                 lhs op              rhs        mi    epc sepc.lv sepc.all
## 78             regi  ~            price 19379.131 -2.302  -2.302   -2.085
## 81             regi  ~     teljesitmeny 18258.341 -0.758  -0.758   -0.758
## 68     teljesitmeny  ~ automatasebvalto 11416.728  0.413   0.413    0.413
## 69     teljesitmeny  ~             regi 10838.724 -0.518  -0.518   -0.518
## 75 automatasebvalto  ~     teljesitmeny 10245.536  0.398   0.398    0.398
## 72 automatasebvalto  ~            price  8752.407  0.899   0.899    0.814
## 80             regi  ~            tomeg  8602.663 -0.520  -0.520   -0.520
## 38               ut  ~            dizel  4031.779  0.234   0.234    0.234
## 65     teljesitmeny  ~            price  4029.860  0.319   0.319    0.289
## 36               ut ~~            dizel  3684.996  0.213   0.213    0.267
##    sepc.nox
## 78   -2.085
## 81   -0.758
## 68    0.413
## 69   -0.518
## 75    0.398
## 72    0.814
## 80   -0.520
## 38    0.234
## 65    0.289
## 36    0.267
# Model 9 (lavaan syntax). Compared with mod8 it adds dizel as a predictor of
# ut, a residual covariance between teljesitmeny and automatasebvalto, and a
# regression of teljesitmeny on the latent regi:
#   - ut is regressed on kor and dizel;
#   - the latent factor regi is measured by manualisklima, automatasebvalto,
#     kor and ut;
#   - dizel is regressed on tomeg;
#   - teljesitmeny and automatasebvalto are allowed to covary;
#   - teljesitmeny is regressed on regi;
#   - price is regressed on teljesitmeny, ut, kor, tomeg, automatasebvalto
#     and regi.
mod9 <- " ut~kor+dizel
        regi=~manualisklima+automatasebvalto+kor+ut
        dizel~tomeg
        teljesitmeny~~automatasebvalto
        teljesitmeny~regi
        price~teljesitmeny+ut+kor+tomeg+automatasebvalto+regi
       "
# std.lv = TRUE fixes the latent variance to 1; estimator = "MLF" gives ML
# estimates with standard errors based on first-order information.
sem9 <- sem(mod9, data = car_st, estimator = "MLF", std.lv = TRUE)
# `fit.measures` is spelled out in full; the former `fit = TRUE` only worked
# through R's partial argument matching.
summary(sem9, fit.measures = TRUE) # CFI: 0.911; TLI: 0.808; RMSEA: 0.161
## lavaan 0.6-19 ended normally after 35 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        22
## 
##   Number of observations                         52698
## 
## Model Test User Model:
##                                                        
##   Test statistic                              17686.765
##   Degrees of freedom                                 13
##   P-value (Chi-square)                            0.000
## 
## Model Test Baseline Model:
## 
##   Test statistic                            197919.598
##   Degrees of freedom                                28
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.911
##   Tucker-Lewis Index (TLI)                       0.808
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)            -433306.643
##   Loglikelihood unrestricted model (H1)    -424463.260
##                                                       
##   Akaike (AIC)                              866657.286
##   Bayesian (BIC)                            866852.478
##   Sample-size adjusted Bayesian (SABIC)     866782.561
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.161
##   90 Percent confidence interval - lower         0.159
##   90 Percent confidence interval - upper         0.163
##   P-value H_0: RMSEA <= 0.050                    0.000
##   P-value H_0: RMSEA >= 0.080                    1.000
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.141
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                              First.order
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   regi =~                                             
##     manualisklima     0.618    0.008   81.359    0.000
##     automatasebvlt   -0.627    0.009  -68.775    0.000
##     kor               0.439    0.006   71.366    0.000
##     ut               -0.056    0.008   -6.851    0.000
## 
## Regressions:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   ut ~                                                
##     kor               0.550    0.006   93.936    0.000
##     dizel             0.253    0.005   47.856    0.000
##   dizel ~                                             
##     tomeg             0.304    0.005   61.800    0.000
##   teljesitmeny ~                                      
##     regi             -0.641    0.009  -70.592    0.000
##   price ~                                             
##     teljesitmeny      0.260    0.003   75.390    0.000
##     ut               -0.094    0.002  -50.169    0.000
##     kor              -0.616    0.002 -247.511    0.000
##     tomeg             0.044    0.003   16.770    0.000
##     automatasebvlt    0.062    0.003   18.828    0.000
##     regi             -0.187    0.007  -26.945    0.000
## 
## Covariances:
##                       Estimate  Std.Err  z-value  P(>|z|)
##  .automatasebvalto ~~                                    
##    .teljesitmeny         0.180    0.008   23.091    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .manualisklima     0.618    0.011   58.693    0.000
##    .automatasebvlt    0.606    0.010   60.351    0.000
##    .kor               0.807    0.007  122.658    0.000
##    .ut                0.656    0.001  537.332    0.000
##    .dizel             0.908    0.023   39.676    0.000
##    .teljesitmeny      0.589    0.009   68.067    0.000
##    .price             0.105    0.001  117.250    0.000
##     regi              1.000
# Path diagram of sem9. `coefs` is the documented argument name; the former
# `coef` only worked through R's partial argument matching.
lavaanPlot(model = sem9, coefs = TRUE, sig = 0.05)
  1. modell
# Model 10 (lavaan syntax). The latent factor is now called regitech and is
# measured by manualisklima, automatasebvalto, teljesitmeny and kor:
#   - ut is regressed on kor and dizel;
#   - dizel is regressed on tomeg;
#   - teljesitmeny and automatasebvalto are allowed to covary;
#   - price is regressed on teljesitmeny, ut, kor, tomeg, automatasebvalto
#     and regitech.
mod10 <- " ut~kor+dizel
        regitech=~manualisklima+automatasebvalto+teljesitmeny+kor
        dizel~tomeg
        teljesitmeny~~automatasebvalto
        price~teljesitmeny+ut+kor+tomeg+automatasebvalto+regitech
       "
# std.lv = TRUE fixes the latent variance to 1; estimator = "MLF" gives ML
# estimates with standard errors based on first-order information.
sem10 <- sem(mod10, data = car_st, estimator = "MLF", std.lv = TRUE)
# `fit.measures` is spelled out in full; the former `fit = TRUE` only worked
# through R's partial argument matching.
summary(sem10, fit.measures = TRUE) # CFI: 0.910; TLI: 0.820; RMSEA: 0.155
## lavaan 0.6-19 ended normally after 29 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        21
## 
##   Number of observations                         52698
## 
## Model Test User Model:
##                                                        
##   Test statistic                              17792.294
##   Degrees of freedom                                 14
##   P-value (Chi-square)                            0.000
## 
## Model Test Baseline Model:
## 
##   Test statistic                            197919.598
##   Degrees of freedom                                28
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.910
##   Tucker-Lewis Index (TLI)                       0.820
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)            -433359.408
##   Loglikelihood unrestricted model (H1)    -424463.260
##                                                       
##   Akaike (AIC)                              866760.815
##   Bayesian (BIC)                            866947.134
##   Sample-size adjusted Bayesian (SABIC)     866880.396
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.155
##   90 Percent confidence interval - lower         0.153
##   90 Percent confidence interval - upper         0.157
##   P-value H_0: RMSEA <= 0.050                    0.000
##   P-value H_0: RMSEA >= 0.080                    1.000
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.142
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                              First.order
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   regitech =~                                         
##     manualisklima     0.611    0.008   80.359    0.000
##     automatasebvlt   -0.635    0.009  -68.933    0.000
##     teljesitmeny     -0.647    0.009  -71.286    0.000
##     kor               0.438    0.006   71.061    0.000
## 
## Regressions:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   ut ~                                                
##     kor               0.526    0.005  107.346    0.000
##     dizel             0.263    0.005   52.113    0.000
##   dizel ~                                             
##     tomeg             0.304    0.005   61.973    0.000
##   price ~                                             
##     teljesitmeny      0.257    0.004   71.253    0.000
##     ut               -0.089    0.002  -52.506    0.000
##     kor              -0.618    0.002 -269.379    0.000
##     tomeg             0.044    0.003   16.586    0.000
##     automatasebvlt    0.059    0.003   16.793    0.000
##     regitech         -0.193    0.007  -26.775    0.000
## 
## Covariances:
##                       Estimate  Std.Err  z-value  P(>|z|)
##  .automatasebvalto ~~                                    
##    .teljesitmeny         0.172    0.008   21.860    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .manualisklima     0.626    0.011   59.311    0.000
##    .automatasebvlt    0.597    0.010   59.467    0.000
##    .teljesitmeny      0.582    0.009   66.309    0.000
##    .kor               0.808    0.007  122.765    0.000
##    .ut                0.658    0.001  562.385    0.000
##    .dizel             0.908    0.023   39.652    0.000
##    .price             0.104    0.001  109.987    0.000
##     regitech          1.000
# Path diagram of sem10. `coefs` is the documented argument name; the former
# `coef` only worked through R's partial argument matching.
lavaanPlot(model = sem10, coefs = TRUE, sig = 0.05)
# Ten largest modification indices of sem10, listed from the highest mi down.
modindices(object = sem10, sort = TRUE, maximum.number = 10)
##              lhs op              rhs        mi    epc sepc.lv sepc.all sepc.nox
## 68         tomeg  ~         regitech 11248.796 -0.570  -0.570   -0.570   -0.570
## 85      regitech  ~            tomeg 11248.796 -0.570  -0.570   -0.570   -0.570
## 66         tomeg  ~     teljesitmeny 11172.980  0.460   0.460    0.460    0.460
## 67         tomeg  ~ automatasebvalto  7135.442  0.368   0.368    0.368    0.368
## 64         tomeg  ~            price  5902.567  0.359   0.359    0.357    0.357
## 73  teljesitmeny  ~            tomeg  3757.263  0.210   0.210    0.210    0.210
## 82      regitech  ~            dizel  2680.470 -0.272  -0.272   -0.272   -0.272
## 28 manualisklima ~~            dizel  1410.271 -0.137  -0.137   -0.181   -0.181
## 83      regitech  ~            price  1392.265 -2.693  -2.693   -2.676   -2.676
## 65         tomeg  ~              kor  1245.664 -0.154  -0.154   -0.154   -0.154
# Model 11 (lavaan syntax). Compared with mod10 it adds a regression of tomeg
# on regitech and teljesitmeny, and labels three paths so their estimates can
# be pulled out by name with coef(sem11):
#   a: kor -> ut, b: ut -> price, c: kor -> price.
#   - ut is regressed on kor (a) and dizel;
#   - the latent factor regitech is measured by manualisklima,
#     automatasebvalto, teljesitmeny and kor;
#   - dizel is regressed on tomeg;
#   - tomeg is regressed on regitech and teljesitmeny;
#   - teljesitmeny and automatasebvalto are allowed to covary;
#   - price is regressed on teljesitmeny, ut (b), kor (c), tomeg,
#     automatasebvalto and regitech.
mod11 <- " ut~a*kor+dizel
        regitech=~manualisklima+automatasebvalto+teljesitmeny+kor
        dizel~tomeg
        tomeg~regitech+teljesitmeny
        teljesitmeny~~automatasebvalto
        price~teljesitmeny+b*ut+c*kor+tomeg+automatasebvalto+regitech
       "
# std.lv = TRUE fixes the latent variance to 1; estimator = "MLF" gives ML
# estimates with standard errors based on first-order information.
sem11 <- sem(mod11, data = car_st, estimator = "MLF", std.lv = TRUE)
# `fit.measures` is spelled out in full; the former `fit = TRUE` only worked
# through R's partial argument matching.
summary(sem11, fit.measures = TRUE) # CFI: 0.982; TLI: 0.958; RMSEA: 0.075
## lavaan 0.6-19 ended normally after 30 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        24
## 
##   Number of observations                         52698
## 
## Model Test User Model:
##                                                       
##   Test statistic                              3542.273
##   Degrees of freedom                                12
##   P-value (Chi-square)                           0.000
## 
## Model Test Baseline Model:
## 
##   Test statistic                            197919.598
##   Degrees of freedom                                28
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.982
##   Tucker-Lewis Index (TLI)                       0.958
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)            -501009.120
##   Loglikelihood unrestricted model (H1)    -499237.983
##                                                       
##   Akaike (AIC)                             1002066.240
##   Bayesian (BIC)                           1002279.176
##   Sample-size adjusted Bayesian (SABIC)    1002202.903
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.075
##   90 Percent confidence interval - lower         0.073
##   90 Percent confidence interval - upper         0.077
##   P-value H_0: RMSEA <= 0.050                    0.000
##   P-value H_0: RMSEA >= 0.080                    0.000
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.040
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                              First.order
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   regitech =~                                         
##     manualisklima     0.611    0.008   81.378    0.000
##     automatasebvlt   -0.647    0.009  -68.498    0.000
##     teljesitmeny     -0.654    0.009  -69.066    0.000
##     kor               0.422    0.006   69.444    0.000
## 
## Regressions:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   ut ~                                                
##     kor        (a)    0.526    0.005  107.033    0.000
##     dizel             0.263    0.005   51.986    0.000
##   dizel ~                                             
##     tomeg             0.304    0.019   15.891    0.000
##   tomeg ~                                             
##     regitech         -0.327    0.017  -19.781    0.000
##     teljestmny        0.246    0.011   22.395    0.000
##   price ~                                             
##     teljestmny        0.263    0.004   71.155    0.000
##     ut         (b)   -0.089    0.002  -52.169    0.000
##     kor        (c)   -0.620    0.002 -280.402    0.000
##     tomeg             0.014    0.004    3.898    0.000
##     autmtsbvlt        0.055    0.004   15.130    0.000
##     regitech         -0.206    0.008  -25.877    0.000
## 
## Covariances:
##                       Estimate  Std.Err  z-value  P(>|z|)
##  .automatasebvalto ~~                                    
##    .teljesitmeny         0.159    0.008   19.809    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .manualisklima     0.626    0.010   60.617    0.000
##    .automatasebvlt    0.581    0.010   59.219    0.000
##    .teljesitmeny      0.572    0.009   61.661    0.000
##    .kor               0.822    0.007  123.822    0.000
##    .ut                0.658    0.001  558.744    0.000
##    .dizel             0.908    0.053   17.189    0.000
##    .tomeg             0.727    0.006  117.779    0.000
##    .price             0.104    0.001  100.954    0.000
##     regitech          1.000
# Path diagram of sem11. `coefs` is the documented argument name; the former
# `coef` only worked through R's partial argument matching.
lavaanPlot(model = sem11, coefs = TRUE, sig = 0.05)
# Age (kor) marks the start of the car's use (date of registration?), so it is
# not a purely exogenous given; it may itself depend on several things.
# NOTE(review): reading exp() of a coefficient as a percentage change presumes
# price enters car_st on a log scale -- confirm.
exp(coef(sem11)["c"]) # direct effect of age (path c) --> lowers price by 46%
##         c 
## 0.5381191
# Indirect effect of age on price through the distance driven (ut): path b
# (ut -> price) times path a (kor -> ut); a decrease of about 4.6%.
# local() keeps the helper vector out of the global environment.
local({
  est <- coef(sem11)
  exp(est["b"] * est["a"])
})
##        b 
## 0.954115
# Total effect of the car's age on the expected price: direct path c plus the
# indirect product b * a; overall a decrease of about 49%.
# local() keeps the helper vector out of the global environment.
local({
  est <- coef(sem11)
  exp(est["c"] + est["b"] * est["a"])
})
##         c 
## 0.5134275