rec_sys.knit

date: “2024-12-10”

#Libraries
library(sf)
library(ggplot2)
library(tmap)
library(dplyr)
library(dplyr)
library(janitor)    
library(lubridate)
library(tidyr)
library(htmltools)

#NEW TERRITORIES MAP

#hongkong shapefile
hk_districts <- st_read("/Users/apple/Downloads/Hong_Kong_18_Districts_3574690825663895789")

## Reading layer `HKDistrict18' from data source 
##   `/Users/apple/Downloads/Hong_Kong_18_Districts_3574690825663895789' using driver `ESRI Shapefile'
## Simple feature collection with 18 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 12672060 ymin: 2529946 xmax: 12739620 ymax: 2579129
## Projected CRS: WGS 84 / Pseudo-Mercator

#map of all hongkong districts
ggplot(data = hk_districts) +
  geom_sf(aes(fill =ENAME)) +  
  labs(title = "Map of all Districts in Hong Kong",
       fill = "District") +
  theme_minimal()

# Districts in NTE and NTW regions
nte_districts <- c("SHA TIN", "TAI PO", "NORTH", "SAI KUNG") 
ntw_districts <- c("KWAI TSING", "TSUEN WAN", "TUEN MUN", "YUEN LONG")
# Subsetting the districts in NTE and NTW
nte_districts_data <- hk_districts[hk_districts$ENAME %in% nte_districts, ]
ntw_districts_data <- hk_districts[hk_districts$ENAME %in% ntw_districts, ]

# Assigning colors to the NTE and NTW districts
nte_colors <- c("#4CAF50", "#2196F3", "#00BCD4", "#FF9800")
ntw_colors <- c("#673AB7", "#E91E63", "#FFC107", "#FFC107")
nte_districts_data$color <- nte_colors
ntw_districts_data$color <- ntw_colors

# New territory map
ggplot() +
  geom_sf(data = nte_districts_data, aes(fill = color)) +
  geom_sf(data = ntw_districts_data, aes(fill = color)) +
  geom_sf_text(data = nte_districts_data, aes(label = ENAME), size = 3, color = "black", fontface = "bold") +
  geom_sf_text(data = ntw_districts_data, aes(label = ENAME), size = 3, color = "black", fontface = "bold") +
  scale_fill_identity() +
  labs(title = "Map of NTE and NTW Districts in Hong Kong") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text = element_blank(),
    axis.ticks = element_blank()
  )

#DATA CLEANING
#Check duplicates
#Duplicate Rows in Each Dataset
#install.packages("openxlsx")
library(openxlsx)
Traffic_data<- read.xlsx("/Users/apple/Downloads/Traffic.xlsx")
Housing_prices <- read.csv("/Users/apple/Downloads/Housing prices.csv")
School_data <- read.xlsx("/Users/apple/Downloads/School data.xlsx")
Census_data <- read.csv("/Users/apple/Downloads/Census and Social information.csv")
Housing_parking <- read.csv("/Users/apple/Downloads/Housing parking.csv")
sum(duplicated(Housing_prices))  # Count of duplicate rows in Housing_prices

## [1] 0

sum(duplicated(School_data))    # Count of duplicate rows in School_data

## [1] 0

sum(duplicated(Census_data))    # Count of duplicate rows in Census_data

## [1] 0

sum(duplicated(Traffic_data))   # Count of duplicate rows in Traffic_data

## [1] 0

#There are no duplicates```{r}

#MISSING VALUES
# Check for missing values and count them in each dataset
sum(is.na(Housing_prices))

## [1] 0

sum(is.na(School_data))

## [1] 0

sum(is.na(Census_data))

## [1] 0

sum(is.na(Traffic_data))

## [1] 0

#There are no missing values

#Normalizing categorical variables
#SCHOOL DATA

#Assign SESSION_RANK Based on Session Type
School_data <- School_data %>%
  mutate(SESSION_RANK = case_when(
    SESSION == "WHOLE DAY" ~ 10,
    SESSION == "P.M." ~ 7,
    SESSION == "A.M." ~ 4,
    SESSION == "EVENING" ~ 2,
    TRUE ~ 0  # In case there are any unexpected values
  ))

# View the updated SESSION_RANK column
table(School_data$SESSION_RANK)

## 
##    2    4    7   10 
##   14  779  731 2037

#English category rank of schools
#Assign RANK Based on English Category
School_data <- School_data %>%
  mutate(ENGLISH_CATEGORY_RANK = case_when(
    `ENGLISH.CATEGORY` %in% c("International Schools (Primary)", "International Schools (Secondary)") ~ 10,
    `ENGLISH.CATEGORY` %in% c("English Schools Foundation (Primary)", "English Schools Foundation (Secondary)") ~ 9,
    `ENGLISH.CATEGORY` == "Government Secondary Schools" ~ 8,
    `ENGLISH.CATEGORY` == "Aided Secondary Schools" ~ 7,
    `ENGLISH.CATEGORY` == "Direct Subsidy Scheme Secondary Schools" ~ 6,
    `ENGLISH.CATEGORY` == "Private Secondary Schools (Day/Evening)" ~ 5,
    `ENGLISH.CATEGORY` == "Aided Primary Schools" ~ 4,
    `ENGLISH.CATEGORY` == "Government Primary Schools" ~ 3,
    `ENGLISH.CATEGORY` %in% c("Kindergartens", "Kindergarten-cum-child Care Centres") ~ 1,
    TRUE ~ 0  # In case there are any unexpected values
  ))
head(School_data)

##     SCHOOL.NO.      ENGLISH.CATEGORY
## 1 115509000123 Aided Primary Schools
## 2 536547000223 Aided Primary Schools
## 3 113000000123 Aided Primary Schools
## 4 113417000223 Aided Primary Schools
## 5 512176000223 Aided Primary Schools
## 6 512176000123 Aided Primary Schools
##                                                    ENGLISH.NAME
## 1                              SHA TIN METHODIST PRIMARY SCHOOL
## 2                                    TUNG CHUNG CATHOLIC SCHOOL
## 3                                              TUNG KOON SCHOOL
## 4 THE CHURCH OF CHRIST IN CHINA CHUEN YUEN FIRST PRIMARY SCHOOL
## 5                                         HOP YAT CHURCH SCHOOL
## 6                                         HOP YAT CHURCH SCHOOL
##                                                                    ENGLISH.ADDRESS
## 1                                       KWONG YUEN ESTATE, SHATIN, NEW TERRITORIES
## 2   8 YAT TUNG STREET YAT TUNG (2) ESTATE TUNG CHUNG LANTAU ISLAND NEW TERRITORIES
## 3                                   JOCKEY CLUB ROAD, SHEUNG SHUI, NEW TERRITORIES
## 4 36 WING SHUN STREET, TSUEN WAN, NEW TERRITORIES (EXCLUDING 2/F VISUAL ARTS ROOM)
## 5                                                  6 PERTH STREET HOMANTIN KOWLOON
## 6                                        7 PRINCESS MARGARET ROAD HOMANTIN KOWLOON
##      LONGITUDE    LATITUDE       EASTING        NORTHING STUDENTS.GENDER   SESSION
## 1 114.21463726 22.38018009      840157.0        826775.0           CO-ED WHOLE DAY
## 2 113.93545348 22.28267348      811388.0        815997.0           CO-ED WHOLE DAY
## 3  114.1310215 22.50282065      831550.0        840356.0           CO-ED WHOLE DAY
## 4 114.11502938 22.36323017 829898.139065 824898.86077027           CO-ED WHOLE DAY
## 5 114.17944646 22.32079952      836533.0        820199.0           CO-ED WHOLE DAY
## 6  114.1793591 22.32051055      836524.0        820167.0           CO-ED WHOLE DAY
##       DISTRICT FINANCE.TYPE SCHOOL.LEVEL TELEPHONE FAX.NUMBER                WEBSITE
## 1      SHA TIN        AIDED      PRIMARY  26366533   26494329 http://www.smps.edu.hk
## 2      ISLANDS        AIDED      PRIMARY  21094962   21210166 http://www.tccs.edu.hk
## 3        NORTH        AIDED      PRIMARY  26700334   26684311  http://www.tks.edu.hk
## 4    TSUEN WAN        AIDED      PRIMARY  24900336   24158627                   N.A.
## 5 KOWLOON CITY        AIDED      PRIMARY  27111013   27142465 http://www.hycs.edu.hk
## 6 KOWLOON CITY        AIDED      PRIMARY  27111013   27142465 http://www.hycs.edu.hk
##                       RELIGION SESSION_RANK ENGLISH_CATEGORY_RANK
## 1 PROTESTANTISM / CHRISTIANITY           10                     4
## 2                  CATHOLICISM           10                     4
## 3               NOT APPLICABLE           10                     4
## 4 PROTESTANTISM / CHRISTIANITY           10                     4
## 5 PROTESTANTISM / CHRISTIANITY           10                     4
## 6 PROTESTANTISM / CHRISTIANITY           10                     4

#ENGLISH_CATEGORY_RANK column
table(School_data$ENGLISH_CATEGORY_RANK)

## 
##    0    1    3    4    5    6    7    8    9   10 
##  259 2248   33  428   49   59  365   30   18   72

#Rank based on finance type
#Assign RANK Based on Finance Type ---
School_data <- School_data %>%
  mutate(FINANCE_TYPE_RANK = case_when(
    `FINANCE.TYPE` == "ENGLISH SCHOOLS FOUNDATION" ~ 10,
    `FINANCE.TYPE` == "DIRECT SUBSIDY SCHEME" ~ 9,
    `FINANCE.TYPE` == "AIDED" ~ 8,
    `FINANCE.TYPE` == "GOVERNMENT" ~ 7,
    `FINANCE.TYPE` == "PRIVATE" ~ 5,
    `FINANCE.TYPE` == "PRIVATE INDEPENDENT SCH SCHEME" ~ 3,
    `FINANCE.TYPE` == "CAPUT" ~ 1,
    TRUE ~ 0  
  ))

# FINANCE_TYPE_RANK column
table(School_data$FINANCE_TYPE_RANK)

## 
##    1    3    5    7    8    9   10 
##    2   15 2426   63  955   82   18

# Assigning RANK Based on School Level 
School_data <- School_data %>%
  mutate(SCHOOL_LEVEL_RANK = case_when(
    `SCHOOL.LEVEL` == "SECONDARY" ~ 10,
    `SCHOOL.LEVEL` == "PRIMARY" ~ 8,
    `SCHOOL.LEVEL` == "KINDERGARTEN-CUM-CHILD CARE CENTRES" ~ 6,
    `SCHOOL.LEVEL` == "KINDERGARTEN" ~ 5,
    TRUE ~ 0  
  ))

# SCHOOL_LEVEL_RANK column
table(School_data$SCHOOL_LEVEL_RANK)

## 
##    5    6    8   10 
## 1253  995  692  621

#districts for NTE and NTW
nte_ntw_districts <- c("SAI KUNG", "SHA TIN", "TAI PO", "NORTH", "YUEN LONG", "KWAI TSING", "TSUEN WAN", "TUEN MUN")

# Filtering School_data to keep only rows where DISTRICT is in NTE or NTW
School_data <- School_data %>%
  filter(DISTRICT %in% nte_ntw_districts)

# Checking the filtered data
table(School_data$DISTRICT)

## 
## KWAI TSING      NORTH   SAI KUNG    SHA TIN     TAI PO  TSUEN WAN   TUEN MUN  YUEN LONG 
##        222        168        234        293        145        116        241        287

#Column for region
nte_districts <- c("SAI KUNG", "SHA TIN", "TAI PO", "NORTH
")
ntw_districts <- c("YUEN LONG", "KWAI TSING", "TSUEN WAN", "TUEN MUN")

School_data <- School_data %>%
  mutate(REGION = case_when(
    DISTRICT %in% nte_districts ~ "NTE",  # New Territories East
    DISTRICT %in% ntw_districts ~ "NTW",  # New Territories West
    TRUE ~ "Other"  
  ))
#School rank
# Combining all the individual ranks to generate a total score for each school
colnames(School_data)

##  [1] "SCHOOL.NO."            "ENGLISH.CATEGORY"      "ENGLISH.NAME"         
##  [4] "ENGLISH.ADDRESS"       "LONGITUDE"             "LATITUDE"             
##  [7] "EASTING"               "NORTHING"              "STUDENTS.GENDER"      
## [10] "SESSION"               "DISTRICT"              "FINANCE.TYPE"         
## [13] "SCHOOL.LEVEL"          "TELEPHONE"             "FAX.NUMBER"           
## [16] "WEBSITE"               "RELIGION"              "SESSION_RANK"         
## [19] "ENGLISH_CATEGORY_RANK" "FINANCE_TYPE_RANK"     "SCHOOL_LEVEL_RANK"    
## [22] "REGION"

School_data <- School_data %>%
  mutate(
    TOTAL_RANK = SESSION_RANK + ENGLISH_CATEGORY_RANK + FINANCE_TYPE_RANK + SCHOOL_LEVEL_RANK
  )

#  Ranking the schools based on the total rank
School_data <- School_data %>%
  arrange(desc(TOTAL_RANK)) %>%
  mutate(RANK = row_number())

#top 10 schools based on the rank
top_schools <- School_data %>%
  select(`ENGLISH.NAME`, DISTRICT, REGION, RANK) %>%  
  slice_head(n = 50)
# View the top 3 schools
head(top_schools, n = 50)

##                                                                                                                        ENGLISH.NAME
## 1                                                                                                                    SHATIN COLLEGE
## 2                                                                                                                    SHATIN COLLEGE
## 3                                                                                                             CLEARWATER BAY SCHOOL
## 4                                                                                                              SHATIN JUNIOR SCHOOL
## 5                                                                                                             CLEARWATER BAY SCHOOL
## 6                                                                                     CARITAS FANLING CHAN CHUN HA SECONDARY SCHOOL
## 7                                                                                            BAPTIST LUI MING CHOI SECONDARY SCHOOL
## 8                                                                                      CHRISTIAN ALLIANCE S W CHAN MEMORIAL COLLEGE
## 9                                                       TEXTILE INSTITUTE AMERICAN CHAMBER OF COMMERCE WOO HON FAI SECONDARY SCHOOL
## 10                                                                                          BUDDHIST SUM HEUNG LAM MEMORIAL COLLEGE
## 11                                                                                                    PO LEUNG KUK C.W. CHU COLLEGE
## 12                                                                         CHRISTIAN & MISSIONARY ALLIANCE SUN KEI SECONDARY SCHOOL
## 13                                                                                                   FUNG KAI NO.1 SECONDARY SCHOOL
## 14                                                                                     CARMEL BUNNAN TONG MEMORIAL SECONDARY SCHOOL
## 15                                                          DAUGHTERS OF MARY HELP OF CHRISTIANS SIU MING CATHOLIC SECONDARY SCHOOL
## 16                                                                               LIONS CLUBS INTERNATIONAL TSEUNG CHUI KING COLLEGE
## 17                                                                                                        SHA TIN METHODIST COLLEGE
## 18                                                                                          KWOK TAK SENG CATHOLIC SECONDARY SCHOOL
## 19                                                                                                          SHATIN PUI YING COLLEGE
## 20                                                         CHENG CHEK CHEE SECONDARY SCHOOL OF SAI KUNG AND HANG HAU DISTRICT, N.T.
## 21                                                                                                 IMMACULATE HEART OF MARY COLLEGE
## 22                                                                                               SHEK LEI CATHOLIC SECONDARY SCHOOL
## 23                                                                           LINGNAN DR. CHUNG WING KWONG MEMORIAL SECONDARY SCHOOL
## 24                                                                         TUNG WAH GROUP OF HOSPITALS LO KON TING MEMORIAL COLLEGE
## 25                                                                                     SHENG KUNG HUI BISHOP BAKER SECONDARY SCHOOL
## 26                                                              MA KAM MING CHARITABLE FOUNDATION MA CHAN DUEN HEY MEMORIAL COLLEGE
## 27                                                                                                     KWAI CHUNG METHODIST COLLEGE
## 28                                                                        YAN CHAI HOSPITAL TUNG CHI YING MEMORIAL SECONDARY SCHOOL
## 29                                                                      THE CHURCH OF CHRIST IN CHINA FONG YUN WAH SECONDARY SCHOOL
## 30                                                                                  CARMEL DIVINE GRACE FOUNDATION SECONDARY SCHOOL
## 31                                                                                              CATHOLIC MING YUEN SECONDARY SCHOOL
## 32                                                                          CUMBERLAND PRESBYTERIAN CHURCH YAO DAO SECONDARY SCHOOL
## 33                                                                                   SHENG KUNG HUI TSANG SHIU TIM SECONDARY SCHOOL
## 34                                                 YUEN LONG PUBLIC MIDDLE SCHOOL ALUMNI ASSOCIATION TANG SIU TONG SECONDARY SCHOOL
## 35                                                                                          ASSEMBLY OF GOD HEBRON SECONDARY SCHOOL
## 36                                                                                                        LOCK TAO SECONDARY SCHOOL
## 37                                                                                                    BUDDHIST WONG WAN TIN COLLEGE
## 38                                                                                              PUI SHING CATHOLIC SECONDARY SCHOOL
## 39                                                       HONG KONG TAOIST ASSOCIATION THE YUEN YUEN INSTITUTE NO.2 SECONDARY SCHOOL
## 40                                                                                                               BETHEL HIGH SCHOOL
## 41                                                                                      SHENG KUNG HUI LAM KAU MOW SECONDARY SCHOOL
## 42                                                                                                                 VALTORTA COLLEGE
## 43                                                                                                    PO LEUNG KUK WU CHUNG COLLEGE
## 44                                                                                                                  KAU YAN COLLEGE
## 45                                                                                        POK OI HOSPITAL CHAN KAI MEMORIAL COLLEGE
## 46                                                                                   THE CHURCH OF CHRIST IN CHINA KEI YUEN COLLEGE
## 47                                                                  HONG KONG TAOIST ASSOCIATION TANG HIN MEMORIAL SECONDARY SCHOOL
## 48 CHRISTIAN ALLIANCE CHENG WING GEE COLLEGE OF THE KOWLOON TONG CHURCH OF THE CHINESE CHRISTIAN AND MISSIONARY ALLIANCE, HONG KONG
## 49                                                                                 TUNG WAH GROUP OF HOSPITALS YOW KAM YUEN COLLEGE
## 50                                                                                                     TOI SHAN ASSOCIATION COLLEGE
##      DISTRICT REGION RANK
## 1     SHA TIN    NTE    1
## 2     SHA TIN    NTE    2
## 3    SAI KUNG    NTE    3
## 4     SHA TIN    NTE    4
## 5    SAI KUNG    NTE    5
## 6       NORTH  Other    6
## 7     SHA TIN    NTE    7
## 8       NORTH  Other    8
## 9   TSUEN WAN    NTW    9
## 10   TUEN MUN    NTW   10
## 11    SHA TIN    NTE   11
## 12   SAI KUNG    NTE   12
## 13      NORTH  Other   13
## 14   TUEN MUN    NTW   14
## 15 KWAI TSING    NTW   15
## 16 KWAI TSING    NTW   16
## 17    SHA TIN    NTE   17
## 18    SHA TIN    NTE   18
## 19    SHA TIN    NTE   19
## 20   SAI KUNG    NTE   20
## 21    SHA TIN    NTE   21
## 22 KWAI TSING    NTW   22
## 23 KWAI TSING    NTW   23
## 24  YUEN LONG    NTW   24
## 25  YUEN LONG    NTW   25
## 26   SAI KUNG    NTE   26
## 27 KWAI TSING    NTW   27
## 28    SHA TIN    NTE   28
## 29  YUEN LONG    NTW   29
## 30   SAI KUNG    NTE   30
## 31   SAI KUNG    NTE   31
## 32  YUEN LONG    NTW   32
## 33    SHA TIN    NTE   33
## 34  YUEN LONG    NTW   34
## 35     TAI PO    NTE   35
## 36    SHA TIN    NTE   36
## 37    SHA TIN    NTE   37
## 38  YUEN LONG    NTW   38
## 39     TAI PO    NTE   39
## 40  YUEN LONG    NTW   40
## 41    SHA TIN    NTE   41
## 42     TAI PO    NTE   42
## 43    SHA TIN    NTE   43
## 44     TAI PO    NTE   44
## 45    SHA TIN    NTE   45
## 46  YUEN LONG    NTW   46
## 47      NORTH  Other   47
## 48    SHA TIN    NTE   48
## 49    SHA TIN    NTE   49
## 50    SHA TIN    NTE   50

##Housing parking
colnames(Housing_parking)

## [1] "District"                                                                                      
## [2] "Location"                                                                                      
## [3] "Types.of.Operating.Hours"                                                                      
## [4] "For.Vehicles.Other.Than.Medium.and.Heavy.Goods.Vehicles..Buses..Motor.Cycles.and.Pedal.Cycles."
## [5] "For.Goods.Vehicles"                                                                            
## [6] "For.Coaches"

nte_districts <- c("SAI KUNG", "SHA TIN", "TAI PO", "NORTH")
ntw_districts <- c("YUEN LONG", "KWAI TSING", "TSUEN WAN", "TUEN MUN")
Housing_parking <- Housing_parking %>%
  mutate(
    Total_Parking = `For.Vehicles.Other.Than.Medium.and.Heavy.Goods.Vehicles..Buses..Motor.Cycles.and.Pedal.Cycles.` + 
      `For.Goods.Vehicles` + 
      `For.Coaches`,  # Calculate total parking spaces
    REGION = case_when(
      District %in% nte_districts ~ "NTE",   
      District %in% ntw_districts ~ "NTW",   
      TRUE ~ "Other"  # Assign "Other" for districts not in NTE or NTW
    )
  ) %>%
  group_by(District, REGION) %>%  # Group by District and REGION
  summarise(
    Total_Parking = sum(Total_Parking, na.rm = TRUE),  # Sum parking spaces by district and region
    .groups = "drop"  # Remove grouping after summarization
  ) %>%
  arrange(REGION, desc(Total_Parking))  # Sort by REGION and then by total parking

# View the result
print(Housing_parking)

## # A tibble: 8 × 3
##   District   REGION Total_Parking
##   <chr>      <chr>          <int>
## 1 SHA TIN    NTE             1587
## 2 TAI PO     NTE             1403
## 3 SAI KUNG   NTE             1203
## 4 NORTH      NTE             1097
## 5 TUEN MUN   NTW             1171
## 6 YUEN LONG  NTW              957
## 7 KWAI TSING NTW              509
## 8 TSUEN WAN  NTW              506

####################Traffic data
colnames(Traffic_data)

## [1] "Date"                "District"            "Control.Point"      
## [4] "Arrival./.Departure" "Hong.Kong.Residents" "Mainland.Visitors"  
## [7] "Other.Visitors"      "Total"

nte_districts <- c("SAI KUNG", "SHA TIN", "TAI PO", "NORTH")
ntw_districts <- c("YUEN LONG", "KWAI TSING", "TSUEN WAN", "TUEN MUN")

# Add REGION column based on districts
Traffic <- Traffic_data %>%
  mutate(REGION = case_when(
    District %in% nte_districts ~ "NTE",
    District %in% ntw_districts ~ "NTW",
    TRUE ~ "Other"
  ))

# Summarize total traffic (including all visitors) by District and Control Point
Traffic_summary <- Traffic %>%
  group_by(District, "Control Point", REGION) %>%
  summarise(
    Total_HongKong_Residents = sum(`Hong.Kong.Residents`, na.rm = TRUE),
    Total_Mainland_Visitors = sum(`Mainland.Visitors`, na.rm = TRUE),
    Total_Other_Visitors = sum(`Other.Visitors`, na.rm = TRUE),
    Total_Traffic = sum(Total, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(REGION, desc(Total_Traffic))  

# Summarize total traffic (arrivals and departures) by District
Traffic_summary <- Traffic %>%
  mutate(REGION = case_when(
    District %in% nte_districts ~ "NTE",
    District %in% ntw_districts ~ "NTW",
    TRUE ~ "Other"
  )) %>%
  group_by(District, REGION) %>%
  summarise(
    Total_HongKong_Residents_Arrivals = sum(`Hong.Kong.Residents`[`Arrival./.Departure` == "Arrival"], na.rm = TRUE),
    Total_HongKong_Residents_Departures = sum(`Hong.Kong.Residents`[`Arrival./.Departure` == "Departure"], na.rm = TRUE),
    Total_Mainland_Visitors_Arrivals = sum(`Mainland.Visitors`[`Arrival./.Departure` == "Arrival"], na.rm = TRUE),
    Total_Mainland_Visitors_Departures = sum(`Mainland.Visitors`[`Arrival./.Departure` == "Departure"], na.rm = TRUE),
    Total_Other_Visitors_Arrivals = sum(`Other.Visitors`[`Arrival./.Departure` == "Arrival"], na.rm = TRUE),
    Total_Other_Visitors_Departures = sum(`Other.Visitors`[`Arrival./.Departure` == "Departure"], na.rm = TRUE),
    Total_Arrivals = sum(Total[`Arrival./.Departure` == "Arrival"], na.rm = TRUE),
    Total_Departures = sum(Total[`Arrival./.Departure` == "Departure"], na.rm = TRUE),
    Total_Traffic = sum(Total, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(REGION, desc(Total_Traffic))  # Sort by REGION and then by total traffic
# View the result
print(Traffic_summary)

## # A tibble: 2 × 11
##   District REGION Total_HongKong_Residents…¹ Total_HongKong_Resid…² Total_Mainland_Visit…³
##   <chr>    <chr>                       <dbl>                  <dbl>                  <dbl>
## 1 NORTH    NTE                      98817538               98766738               26783604
## 2 TUEN MUN NTW                             0                      0                      0
## # ℹ abbreviated names: ¹Total_HongKong_Residents_Arrivals,
## #   ²Total_HongKong_Residents_Departures, ³Total_Mainland_Visitors_Arrivals
## # ℹ 6 more variables: Total_Mainland_Visitors_Departures <dbl>,
## #   Total_Other_Visitors_Arrivals <dbl>, Total_Other_Visitors_Departures <dbl>,
## #   Total_Arrivals <dbl>, Total_Departures <dbl>, Total_Traffic <dbl>

###############
###############Housing prices
# Reshape the dataset from wide to long format
Housing_long <- Housing_prices %>%
  pivot_longer(cols = `KWAI.TSING`:`YUEN.LONG`,
               names_to = "District", 
               values_to = "Housing_Price") %>%
  arrange(Year, District)

# View the reshaped dataset
print(Housing_long)

## # A tibble: 200 × 3
##     Year District   Housing_Price
##    <int> <chr>              <int>
##  1  1999 KWAI.TSING         41861
##  2  1999 NORTH              35735
##  3  1999 SAI.KUNG           49287
##  4  1999 SHA.TIN            35471
##  5  1999 TAI.PO             37095
##  6  1999 TSUEN.WAN          35042
##  7  1999 TUEN.MUN           60751
##  8  1999 YUEN.LONG          43952
##  9  2000 KWAI.TSING         35975
## 10  2000 NORTH              31444
## # ℹ 190 more rows

# Define NTE and NTW districts
colnames(Housing_long)

## [1] "Year"          "District"      "Housing_Price"

nte_districts <- c("SAI KUNG", "SHA TIN", "TAI PO", "NORTH")
ntw_districts <- c("YUEN LONG", "KWAI TSING", "TSUEN WAN", "TUEN MUN")

# Add a column for REGION
Housing_long <- Housing_long %>%
  mutate(REGION = case_when(
    District %in% nte_districts ~ "NTE",   
    District %in% ntw_districts ~ "NTW",   
    TRUE ~ "Other"  
  ))

ggplot(Housing_long, aes(x = Year, y = Housing_Price, color = REGION, group = District)) +
  geom_line(size = 1.2) +  # Bold lines
  geom_text(data = Housing_long %>%
              group_by(District) %>%
              filter(Year == max(Year)),  # Filter to get last year for each district
            aes(label = District), 
            vjust = -0.5, size = 3, check_overlap = TRUE) +
  labs(title = "Housing Price Trends by District (1999-2023)", 
       x = "Year", 
       y = "Housing Price (HKD)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

# Define the districts for NTE and NTW
nte_districts <- c("SAI KUNG", "SHA TIN", "TAI PO", "NORTH")
ntw_districts <- c("YUEN LONG", "KWAI TSING", "TSUEN WAN", "TUEN MUN")

# average housing price for each district, region, and rank by price
Housing_summary <- Housing_prices %>%
  pivot_longer(cols = starts_with("KWAI.TSING"):starts_with("YUEN.LONG"),
               names_to = "District", 
               values_to = "Housing_Price") %>%
  group_by(District) %>%
  summarise(Average_Housing_Price = mean(Housing_Price, na.rm = TRUE)) %>%
  mutate(REGION = case_when(
    District %in% nte_districts ~ "NTE",   
    District %in% ntw_districts ~ "NTW",   
    TRUE ~ "Other"  
  )) %>%
  arrange(Average_Housing_Price)  # Sorting by Average_Housing_Price (ascending)

# View the result
print(Housing_summary)

## # A tibble: 8 × 3
##   District   Average_Housing_Price REGION
##   <chr>                      <dbl> <chr> 
## 1 TSUEN.WAN                 69170. Other 
## 2 NORTH                     75031. NTE   
## 3 SHA.TIN                   81318  Other 
## 4 TAI.PO                    86091. Other 
## 5 KWAI.TSING                99885. Other 
## 6 SAI.KUNG                 104439. Other 
## 7 YUEN.LONG                110206. Other 
## 8 TUEN.MUN                 127430. Other

##################Census and social  information
# Define the districts for NTE and NTW in capital letters
nte_districts <- c("SAI KUNG", "SHA TIN", "TAI PO", "NORTH")
ntw_districts <- c("YUEN LONG", "KWAI TSING", "TSUEN WAN", "TUEN MUN")
colnames(Census_data)

##  [1] "DISTRICT"                      "Total.population"             
##  [3] "Male"                          "Female"                       
##  [5] "X..15"                         "X15...24"                     
##  [7] "X25...44"                      "X45...64"                     
##  [9] "X65."                          "Divorces"                     
## [11] "English.speaking"              "No.schooling..Pre.primary"    
## [13] "Post.secondary..Degree.course" "Study.in.the.same.district.3."
## [15] "Working.population"            "Employers"

# Add the REGION column to Census_data, considering the capitalized 'DISTRICT' column name
Census_data <- Census_data %>%
  mutate(REGION = case_when(
    `DISTRICT` %in% nte_districts ~ "NTE",   
    `DISTRICT` %in% ntw_districts ~ "NTW",   
    TRUE ~ "Other"  
  ))


#Friendly neighbourhood is the one with many children
# age distribution visualization
# age group data to compare across districts
Census_data %>%
  gather(key = "Age_Group", value = "Population", "X..15", "X15...24", "X25...44", "X45...64", "X65.") %>%
  ggplot(aes(x = DISTRICT, y = Population, fill = Age_Group)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Age Group Distribution by District", x = "District", y = "Population")

#Education
# percentage of population with post-secondary education
Census_data %>%
  mutate(Post_Secondary_Percentage = (`Post.secondary..Degree.course` / `Total.population`) * 100) %>%
  ggplot(aes(x = DISTRICT, y = Post_Secondary_Percentage, fill = REGION)) +
  geom_bar(stat = "identity") +
  labs(title = "Post-Secondary Education Percentage by District", x = "District", y = "Percentage (%)")

#working population and employers
# Plot of working population and employers by district
ggplot(Census_data, aes(x = DISTRICT, y = `Working.population`, fill = REGION)) +
  geom_bar(stat = "identity") +
  labs(title = "Working Population by District", x = "District", y = "Working Population")

# social dynamics
# Plot divorce rates by district
ggplot(Census_data, aes(x = DISTRICT, y = Divorces, fill = REGION)) +
  geom_bar(stat = "identity") +
  labs(title = "Divorce Rates by District", x = "District", y = "Divorces")

Census_data <- Census_data %>%
  mutate(
    Post_Secondary_Percentage = (`Post.secondary..Degree.course` / `Total.population`) * 100,
    Rank_Score = (`Working.population` * 0.6) + (Post_Secondary_Percentage * 0.4)
  ) %>%
  arrange(desc(Rank_Score))
# Compare the number of working population in NTE and NTW
ggplot(Census_data, aes(x = REGION, y = `Working.population`, fill = REGION)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Working Population by Region (NTE vs NTW)", x = "Region", y = "Working Population")

####Cluster analysis
# Select relevant columns for clustering
clustering_data <- Census_data %>%
  select(`Total.population`, `Post.secondary..Degree.course`, `Working.population`, Divorces)

# Normalize data for clustering
clustering_data_scaled <- scale(clustering_data)

# Perform k-means clustering (you can experiment with k)
set.seed(123)
kmeans_result <- kmeans(clustering_data_scaled, centers = 3)

# Add cluster to the original data
Census_data$Cluster <- kmeans_result$cluster

# Visualize clustering
ggplot(Census_data, aes(x = `Working.population`, y = `Post.secondary..Degree.course`, color = as.factor(Cluster))) +
  geom_point() +
  labs(title = "Cluster Analysis of Districts", x = "Working Population", y = "Post-Secondary Education")

#MERGE DATASET
# Standardizing column names for consistency
colnames(Census_data)[colnames(Census_data) == "DISTRICT"] <- "District"
colnames(Housing_parking)[colnames(Housing_parking) == "District"] <- "District"
colnames(Housing_long)[colnames(Housing_long) == "District"] <- "District"
colnames(Traffic)[colnames(Traffic) == "District"] <- "District"
colnames(School_data)[colnames(School_data) == "DISTRICT"] <- "District"
colnames(Census_data)[colnames(Census_data) == "District"] <- "District"

# Function to calculate mode
get_mode <- function(x) {
  ux <- unique(na.omit(x))
  ux[which.max(tabulate(match(x, ux)))]
}

# Aggregating School_data by District
School_data_agg <- School_data %>%
  select(-c(ENGLISH.NAME, ENGLISH.ADDRESS, LONGITUDE, LATITUDE, EASTING, NORTHING, TELEPHONE, FAX.NUMBER, WEBSITE, RELIGION)) %>%
  group_by(District) %>%
  summarize(
    Avg_SCHOOL_NO = mean(as.numeric(SCHOOL.NO.), na.rm = TRUE), 
    Most_Common_ENGLISH_CATEGORY = get_mode(ENGLISH.CATEGORY),
    Avg_SESSION_RANK = mean(as.numeric(SESSION_RANK), na.rm = TRUE),
    Avg_ENGLISH_CATEGORY_RANK = mean(as.numeric(ENGLISH_CATEGORY_RANK), na.rm = TRUE),
    Avg_FINANCE_TYPE_RANK = mean(as.numeric(FINANCE_TYPE_RANK), na.rm = TRUE),
    Avg_SCHOOL_LEVEL_RANK = mean(as.numeric(SCHOOL_LEVEL_RANK), na.rm = TRUE),
    Avg_TOTAL_RANK = mean(as.numeric(TOTAL_RANK), na.rm = TRUE),
    Avg_RANK = mean(as.numeric(RANK), na.rm = TRUE),
    Most_Common_STUDENTS_GENDER = get_mode(STUDENTS.GENDER),
    Most_Common_SESSION = get_mode(SESSION),
    Most_Common_FINANCE_TYPE = get_mode(FINANCE.TYPE),
    Most_Common_SCHOOL_LEVEL = get_mode(SCHOOL.LEVEL)
  )

# View the first few rows of the aggregated School_data
head(School_data_agg)

## # A tibble: 6 × 13
##   District   Avg_SCHOOL_NO Most_Common_ENGLISH_C…¹ Avg_SESSION_RANK Avg_ENGLISH_CATEGORY…²
##   <chr>              <dbl> <chr>                              <dbl>                  <dbl>
## 1 KWAI TSING 344182346973. Kindergartens                       8.15                   2.18
## 2 NORTH      357500863242. Kindergartens                       8.11                   2.31
## 3 SAI KUNG   464819423238. Kindergarten-cum-child…             7.79                   2.18
## 4 SHA TIN    350165730508. Kindergartens                       8.08                   2.47
## 5 TAI PO     334602400139. Kindergartens                       8.32                   2.66
## 6 TSUEN WAN  371665396702. Kindergarten-cum-child…             7.85                   2.40
## # ℹ abbreviated names: ¹Most_Common_ENGLISH_CATEGORY, ²Avg_ENGLISH_CATEGORY_RANK
## # ℹ 8 more variables: Avg_FINANCE_TYPE_RANK <dbl>, Avg_SCHOOL_LEVEL_RANK <dbl>,
## #   Avg_TOTAL_RANK <dbl>, Avg_RANK <dbl>, Most_Common_STUDENTS_GENDER <chr>,
## #   Most_Common_SESSION <chr>, Most_Common_FINANCE_TYPE <chr>,
## #   Most_Common_SCHOOL_LEVEL <chr>

# Merge step-by-step
merged_data <- Census_data %>%
  left_join(Housing_parking, by = "District")

gc() # Clear memory

##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 3889291 207.8    6434728 343.7         NA  6434728 343.7
## Vcells 6944985  53.0   14786712 112.9      16384 10146326  77.5

merged_data <- merged_data %>%
  left_join(Housing_long, by = "District")

gc() # Clear memory

##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 3889307 207.8    6434728 343.7         NA  6434728 343.7
## Vcells 6945409  53.0   14786712 112.9      16384 10146326  77.5

merged_data <- merged_data %>%
  left_join(Traffic, by = "District")

## Warning in left_join(., Traffic, by = "District"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 4 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to
##   silence this warning.

gc() # Clear memory

##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  3892156 207.9    6434728 343.7         NA  6434728 343.7
## Vcells 15418564 117.7   24276887 185.3      16384 15798978 120.6

merged_data <- merged_data %>%
  left_join(School_data_agg, by = "District")

gc() # Clear memory

##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  3892186 207.9    6434728 343.7         NA  6434728 343.7
## Vcells 19742371 150.7   31621539 241.3      16384 28590010 218.2

tail(merged_data)

##        District Total.population   Male Female X..15 X15...24 X25...44 X45...64  X65.
## 360307    NORTH           309631 145343 164288 33499    27654    88314   104607 55557
## 360308    NORTH           309631 145343 164288 33499    27654    88314   104607 55557
## 360309    NORTH           309631 145343 164288 33499    27654    88314   104607 55557
## 360310    NORTH           309631 145343 164288 33499    27654    88314   104607 55557
## 360311    NORTH           309631 145343 164288 33499    27654    88314   104607 55557
## 360312    NORTH           309631 145343 164288 33499    27654    88314   104607 55557
##        Divorces English.speaking No.schooling..Pre.primary Post.secondary..Degree.course
## 360307    17410             4330                     11971                         48482
## 360308    17410             4330                     11971                         48482
## 360309    17410             4330                     11971                         48482
## 360310    17410             4330                     11971                         48482
## 360311    17410             4330                     11971                         48482
## 360312    17410             4330                     11971                         48482
##        Study.in.the.same.district.3. Working.population Employers REGION.x
## 360307                         23134             146416      2622      NTE
## 360308                         23134             146416      2622      NTE
## 360309                         23134             146416      2622      NTE
## 360310                         23134             146416      2622      NTE
## 360311                         23134             146416      2622      NTE
## 360312                         23134             146416      2622      NTE
##        Post_Secondary_Percentage Rank_Score Cluster REGION.y Total_Parking Year
## 360307                  15.65799   87855.86       2      NTE          1097 2023
## 360308                  15.65799   87855.86       2      NTE          1097 2023
## 360309                  15.65799   87855.86       2      NTE          1097 2023
## 360310                  15.65799   87855.86       2      NTE          1097 2023
## 360311                  15.65799   87855.86       2      NTE          1097 2023
## 360312                  15.65799   87855.86       2      NTE          1097 2023
##        Housing_Price REGION.x.x  Date  Control.Point Arrival./.Departure
## 360307        130015        NTE 45516 Heung Yuen Wai             Arrival
## 360308        130015        NTE 45516 Heung Yuen Wai           Departure
## 360309        130015        NTE 45516    Lok Ma Chau             Arrival
## 360310        130015        NTE 45516    Lok Ma Chau           Departure
## 360311        130015        NTE 45516    Sha Tau Kok             Arrival
## 360312        130015        NTE 45516    Sha Tau Kok           Departure
##        Hong.Kong.Residents Mainland.Visitors Other.Visitors Total REGION.y.y
## 360307               43866             13650            227 57743        NTE
## 360308               33769              7563            232 41564        NTE
## 360309               27305              3390            386 31081        NTE
## 360310               13307              6242            608 20157        NTE
## 360311                   0                 0              0     0        NTE
## 360312                   0                 0              0     0        NTE
##        Avg_SCHOOL_NO Most_Common_ENGLISH_CATEGORY Avg_SESSION_RANK
## 360307  357500863242                Kindergartens         8.113095
## 360308  357500863242                Kindergartens         8.113095
## 360309  357500863242                Kindergartens         8.113095
## 360310  357500863242                Kindergartens         8.113095
## 360311  357500863242                Kindergartens         8.113095
## 360312  357500863242                Kindergartens         8.113095
##        Avg_ENGLISH_CATEGORY_RANK Avg_FINANCE_TYPE_RANK Avg_SCHOOL_LEVEL_RANK
## 360307                  2.309524              6.005952              6.571429
## 360308                  2.309524              6.005952              6.571429
## 360309                  2.309524              6.005952              6.571429
## 360310                  2.309524              6.005952              6.571429
## 360311                  2.309524              6.005952              6.571429
## 360312                  2.309524              6.005952              6.571429
##        Avg_TOTAL_RANK Avg_RANK Most_Common_STUDENTS_GENDER Most_Common_SESSION
## 360307             23  866.375                       CO-ED           WHOLE DAY
## 360308             23  866.375                       CO-ED           WHOLE DAY
## 360309             23  866.375                       CO-ED           WHOLE DAY
## 360310             23  866.375                       CO-ED           WHOLE DAY
## 360311             23  866.375                       CO-ED           WHOLE DAY
## 360312             23  866.375                       CO-ED           WHOLE DAY
##        Most_Common_FINANCE_TYPE Most_Common_SCHOOL_LEVEL
## 360307                  PRIVATE             KINDERGARTEN
## 360308                  PRIVATE             KINDERGARTEN
## 360309                  PRIVATE             KINDERGARTEN
## 360310                  PRIVATE             KINDERGARTEN
## 360311                  PRIVATE             KINDERGARTEN
## 360312                  PRIVATE             KINDERGARTEN

# View duplicated rows

duplicated_rows <- merged_data[duplicated(merged_data$District), ]
View(duplicated_rows)

#  remove duplicates
#merged_data <- merged_data %>% distinct()

# Aggregate School Data 
#merged_data <- merged_data %>%
  #group_by(District) %>%
   #summarize(across(everything(), ~ if(is.numeric(.)) mean(., na.rm = TRUE) else first(.)))

# Check for missing values
colSums(is.na(merged_data))

##                      District              Total.population                          Male 
##                             0                             0                             0 
##                        Female                         X..15                      X15...24 
##                             0                             0                             0 
##                      X25...44                      X45...64                          X65. 
##                             0                             0                             0 
##                      Divorces              English.speaking     No.schooling..Pre.primary 
##                             0                             0                             0 
## Post.secondary..Degree.course Study.in.the.same.district.3.            Working.population 
##                             0                             0                             0 
##                     Employers                      REGION.x     Post_Secondary_Percentage 
##                             0                             0                             0 
##                    Rank_Score                       Cluster                      REGION.y 
##                             0                             0                             0 
##                 Total_Parking                          Year                 Housing_Price 
##                             0                           812                           812 
##                    REGION.x.x                          Date                 Control.Point 
##                           812                             6                             6 
##           Arrival./.Departure           Hong.Kong.Residents             Mainland.Visitors 
##                             6                             6                             6 
##                Other.Visitors                         Total                    REGION.y.y 
##                             6                             6                             6 
##                 Avg_SCHOOL_NO  Most_Common_ENGLISH_CATEGORY              Avg_SESSION_RANK 
##                             0                             0                             0 
##     Avg_ENGLISH_CATEGORY_RANK         Avg_FINANCE_TYPE_RANK         Avg_SCHOOL_LEVEL_RANK 
##                             0                             0                             0 
##                Avg_TOTAL_RANK                      Avg_RANK   Most_Common_STUDENTS_GENDER 
##                             0                             0                             0 
##           Most_Common_SESSION      Most_Common_FINANCE_TYPE      Most_Common_SCHOOL_LEVEL 
##                             0                             0                             0

# Visualize missing data
library(naniar)
#vis_miss(merged_data)  
# Impute missing values with the mean 
# Impute missing values for numerical columns with the mean
merged_data <- merged_data %>%
  mutate(
    Total_Parking = ifelse(is.na(Total_Parking), mean(Total_Parking, na.rm = TRUE), Total_Parking),
    Housing_Price = ifelse(is.na(Housing_Price), mean(Housing_Price, na.rm = TRUE), Housing_Price),
    Avg_SCHOOL_NO = ifelse(is.na(Avg_SCHOOL_NO), mean(Avg_SCHOOL_NO, na.rm = TRUE), Avg_SCHOOL_NO),
    Avg_SESSION_RANK = ifelse(is.na(Avg_SESSION_RANK), mean(Avg_SESSION_RANK, na.rm = TRUE), Avg_SESSION_RANK),
    Avg_ENGLISH_CATEGORY_RANK = ifelse(is.na(Avg_ENGLISH_CATEGORY_RANK), mean(Avg_ENGLISH_CATEGORY_RANK, na.rm = TRUE), Avg_ENGLISH_CATEGORY_RANK),
    Avg_FINANCE_TYPE_RANK = ifelse(is.na(Avg_FINANCE_TYPE_RANK), mean(Avg_FINANCE_TYPE_RANK, na.rm = TRUE), Avg_FINANCE_TYPE_RANK),
    Avg_SCHOOL_LEVEL_RANK = ifelse(is.na(Avg_SCHOOL_LEVEL_RANK), mean(Avg_SCHOOL_LEVEL_RANK, na.rm = TRUE), Avg_SCHOOL_LEVEL_RANK),
    Avg_TOTAL_RANK = ifelse(is.na(Avg_TOTAL_RANK), mean(Avg_TOTAL_RANK, na.rm = TRUE), Avg_TOTAL_RANK)
  )
# Function to get the mode
get_mode <- function(x) {
  uniq_x <- unique(x)
  uniq_x[which.max(tabulate(match(x, uniq_x)))]
}

# Impute missing values for categorical columns with the mode
merged_data <- merged_data %>%
  mutate(
    Most_Common_ENGLISH_CATEGORY = ifelse(is.na(Most_Common_ENGLISH_CATEGORY), get_mode(Most_Common_ENGLISH_CATEGORY), Most_Common_ENGLISH_CATEGORY),
    Most_Common_STUDENTS_GENDER = ifelse(is.na(Most_Common_STUDENTS_GENDER), get_mode(Most_Common_STUDENTS_GENDER), Most_Common_STUDENTS_GENDER),
    Most_Common_SESSION = ifelse(is.na(Most_Common_SESSION), get_mode(Most_Common_SESSION), Most_Common_SESSION),
    Most_Common_FINANCE_TYPE = ifelse(is.na(Most_Common_FINANCE_TYPE), get_mode(Most_Common_FINANCE_TYPE), Most_Common_FINANCE_TYPE),
    Most_Common_SCHOOL_LEVEL = ifelse(is.na(Most_Common_SCHOOL_LEVEL), get_mode(Most_Common_SCHOOL_LEVEL), Most_Common_SCHOOL_LEVEL)
  )
# Impute missing values for numerical columns with the mean
merged_data <- merged_data %>%
  mutate(
    Hong.Kong.Residents = ifelse(is.na(Hong.Kong.Residents), mean(Hong.Kong.Residents, na.rm = TRUE), Hong.Kong.Residents),
    Mainland.Visitors = ifelse(is.na(Mainland.Visitors), mean(Mainland.Visitors, na.rm = TRUE), Mainland.Visitors),
    Other.Visitors = ifelse(is.na(Other.Visitors), mean(Other.Visitors, na.rm = TRUE), Other.Visitors),
    Total = ifelse(is.na(Total), mean(Total, na.rm = TRUE), Total),
    Avg_SCHOOL_NO = ifelse(is.na(Avg_SCHOOL_NO), mean(Avg_SCHOOL_NO, na.rm = TRUE), Avg_SCHOOL_NO),
    Avg_SESSION_RANK = ifelse(is.na(Avg_SESSION_RANK), mean(Avg_SESSION_RANK, na.rm = TRUE), Avg_SESSION_RANK),
    Date = ifelse(is.na(Date), mean(Date, na.rm = TRUE), Date)  # If Date is numeric, else use mode for categorical
  )

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Date = ifelse(is.na(Date), mean(Date, na.rm = TRUE), Date)`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA

# Function to get the mode (for categorical data)
get_mode <- function(x) {
  uniq_x <- unique(x)
  uniq_x[which.max(tabulate(match(x, uniq_x)))]
}

# Impute missing values for categorical columns with the mode
merged_data <- merged_data %>%
  mutate(
    REGION.x.x = ifelse(is.na(REGION.x.x), get_mode(REGION.x.x), REGION.x.x),
    REGION.y.y = ifelse(is.na(REGION.y.y), get_mode(REGION.y.y), REGION.y.y),
    Control.Point = ifelse(is.na(Control.Point), get_mode(Control.Point), Control.Point),
    Most_Common_ENGLISH_CATEGORY = ifelse(is.na(Most_Common_ENGLISH_CATEGORY), get_mode(Most_Common_ENGLISH_CATEGORY), Most_Common_ENGLISH_CATEGORY)
  )




colnames(merged_data) <- make.names(colnames(merged_data))  # Replaces spaces and symbols with dots

# Check data types
str(merged_data)

## 'data.frame':    360312 obs. of  45 variables:
##  $ District                     : chr  "SHA TIN" "YUEN LONG" "SAI KUNG" "TUEN MUN" ...
##  $ Total.population             : int  692806 668080 489037 506879 506879 506879 506879 506879 506879 506879 ...
##  $ Male                         : int  317927 314056 223094 240321 240321 240321 240321 240321 240321 240321 ...
##  $ Female                       : int  374879 354024 265943 266558 266558 266558 266558 266558 266558 266558 ...
##  $ X..15                        : int  78886 77199 56329 54136 54136 54136 54136 54136 54136 54136 ...
##  $ X15...24                     : int  53696 58988 39744 38535 38535 38535 38535 38535 38535 38535 ...
##  $ X25...44                     : int  201135 211376 150700 152609 152609 152609 152609 152609 152609 152609 ...
##  $ X45...64                     : int  220604 220151 164991 163644 163644 163644 163644 163644 163644 163644 ...
##  $ X65.                         : int  138485 100366 77273 97955 97955 97955 97955 97955 97955 97955 ...
##  $ Divorces                     : int  33651 36117 19976 29619 29619 29619 29619 29619 29619 29619 ...
##  $ English.speaking             : int  21042 16754 28502 9038 9038 9038 9038 9038 9038 9038 ...
##  $ No.schooling..Pre.primary    : int  23269 24183 14160 20637 20637 20637 20637 20637 20637 20637 ...
##  $ Post.secondary..Degree.course: int  154778 116916 128362 79104 79104 79104 79104 79104 79104 79104 ...
##  $ Study.in.the.same.district.3.: int  56374 36841 33449 49082 49082 49082 49082 49082 49082 49082 ...
##  $ Working.population           : int  340605 332587 262126 247419 247419 247419 247419 247419 247419 247419 ...
##  $ Employers                    : int  9333 8749 8206 5248 5248 5248 5248 5248 5248 5248 ...
##  $ REGION.x                     : chr  "NTE" "NTW" "NTE" "NTW" ...
##  $ Post_Secondary_Percentage    : num  22.3 17.5 26.2 15.6 15.6 ...
##  $ Rank_Score                   : num  204372 199559 157286 148458 148458 ...
##  $ Cluster                      : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ REGION.y                     : chr  "NTE" "NTW" "NTE" "NTW" ...
##  $ Total_Parking                : int  1587 957 1203 1171 1171 1171 1171 1171 1171 1171 ...
##  $ Year                         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Housing_Price                : num  75031 75031 75031 75031 75031 ...
##  $ REGION.x.x                   : chr  "NTE" "NTE" "NTE" "NTE" ...
##  $ Date                         : chr  NA NA NA "44197" ...
##  $ Control.Point                : chr  "Lo Wu" "Lo Wu" "Lo Wu" "Tuen Mun Ferry Terminal" ...
##  $ Arrival...Departure          : chr  NA NA NA "Arrival" ...
##  $ Hong.Kong.Residents          : num  13709 13709 13709 0 0 ...
##  $ Mainland.Visitors            : num  3690 3690 3690 0 0 ...
##  $ Other.Visitors               : num  138 138 138 0 0 ...
##  $ Total                        : num  17537 17537 17537 0 0 ...
##  $ REGION.y.y                   : chr  "NTE" "NTE" "NTE" "NTW" ...
##  $ Avg_SCHOOL_NO                : num  3.50e+11 4.14e+11 4.65e+11 3.31e+11 3.31e+11 ...
##  $ Most_Common_ENGLISH_CATEGORY : chr  "Kindergartens" "Kindergartens" "Kindergarten-cum-child Care Centres" "Kindergartens" ...
##  $ Avg_SESSION_RANK             : num  8.08 8 7.79 8.27 8.27 ...
##  $ Avg_ENGLISH_CATEGORY_RANK    : num  2.47 2.28 2.18 2.35 2.35 ...
##  $ Avg_FINANCE_TYPE_RANK        : num  6.1 6 5.85 6.09 6.09 ...
##  $ Avg_SCHOOL_LEVEL_RANK        : num  6.88 6.66 6.62 6.78 6.78 ...
##  $ Avg_TOTAL_RANK               : num  23.5 22.9 22.4 23.5 23.5 ...
##  $ Avg_RANK                     : num  822 873 896 828 828 ...
##  $ Most_Common_STUDENTS_GENDER  : chr  "CO-ED" "CO-ED" "CO-ED" "CO-ED" ...
##  $ Most_Common_SESSION          : chr  "WHOLE DAY" "WHOLE DAY" "WHOLE DAY" "WHOLE DAY" ...
##  $ Most_Common_FINANCE_TYPE     : chr  "PRIVATE" "PRIVATE" "PRIVATE" "PRIVATE" ...
##  $ Most_Common_SCHOOL_LEVEL     : chr  "KINDERGARTEN" "KINDERGARTEN" "KINDERGARTEN-CUM-CHILD CARE CENTRES" "KINDERGARTEN" ...

# Convert  columns
merged_data <- merged_data %>%
  mutate(
    Year = as.integer(Year),
    District = as.factor(District),
    Housing_Price = as.numeric(Housing_Price),
    Total_Parking = as.numeric(Total_Parking)
  )


print(head(merged_data))

##    District Total.population   Male Female X..15 X15...24 X25...44 X45...64   X65.
## 1   SHA TIN           692806 317927 374879 78886    53696   201135   220604 138485
## 2 YUEN LONG           668080 314056 354024 77199    58988   211376   220151 100366
## 3  SAI KUNG           489037 223094 265943 56329    39744   150700   164991  77273
## 4  TUEN MUN           506879 240321 266558 54136    38535   152609   163644  97955
## 5  TUEN MUN           506879 240321 266558 54136    38535   152609   163644  97955
## 6  TUEN MUN           506879 240321 266558 54136    38535   152609   163644  97955
##   Divorces English.speaking No.schooling..Pre.primary Post.secondary..Degree.course
## 1    33651            21042                     23269                        154778
## 2    36117            16754                     24183                        116916
## 3    19976            28502                     14160                        128362
## 4    29619             9038                     20637                         79104
## 5    29619             9038                     20637                         79104
## 6    29619             9038                     20637                         79104
##   Study.in.the.same.district.3. Working.population Employers REGION.x
## 1                         56374             340605      9333      NTE
## 2                         36841             332587      8749      NTW
## 3                         33449             262126      8206      NTE
## 4                         49082             247419      5248      NTW
## 5                         49082             247419      5248      NTW
## 6                         49082             247419      5248      NTW
##   Post_Secondary_Percentage Rank_Score Cluster REGION.y Total_Parking Year Housing_Price
## 1                  22.34074   204371.9       3      NTE          1587   NA       75030.6
## 2                  17.50030   199559.2       3      NTW           957   NA       75030.6
## 3                  26.24791   157286.1       3      NTE          1203   NA       75030.6
## 4                  15.60609   148457.6       3      NTW          1171   NA       75030.6
## 5                  15.60609   148457.6       3      NTW          1171   NA       75030.6
## 6                  15.60609   148457.6       3      NTW          1171   NA       75030.6
##   REGION.x.x  Date           Control.Point Arrival...Departure Hong.Kong.Residents
## 1        NTE  <NA>                   Lo Wu                <NA>            13709.48
## 2        NTE  <NA>                   Lo Wu                <NA>            13709.48
## 3        NTE  <NA>                   Lo Wu                <NA>            13709.48
## 4        NTE 44197 Tuen Mun Ferry Terminal             Arrival                0.00
## 5        NTE 44197 Tuen Mun Ferry Terminal           Departure                0.00
## 6        NTE 44228 Tuen Mun Ferry Terminal             Arrival                0.00
##   Mainland.Visitors Other.Visitors    Total REGION.y.y Avg_SCHOOL_NO
## 1           3689.72       137.6715 17536.87        NTE  350165730508
## 2           3689.72       137.6715 17536.87        NTE  414190289336
## 3           3689.72       137.6715 17536.87        NTE  464819423238
## 4              0.00         0.0000     0.00        NTW  330503680643
## 5              0.00         0.0000     0.00        NTW  330503680643
## 6              0.00         0.0000     0.00        NTW  330503680643
##          Most_Common_ENGLISH_CATEGORY Avg_SESSION_RANK Avg_ENGLISH_CATEGORY_RANK
## 1                       Kindergartens         8.075085                  2.467577
## 2                       Kindergartens         8.000000                  2.282230
## 3 Kindergarten-cum-child Care Centres         7.790598                  2.179487
## 4                       Kindergartens         8.269710                  2.348548
## 5                       Kindergartens         8.269710                  2.348548
## 6                       Kindergartens         8.269710                  2.348548
##   Avg_FINANCE_TYPE_RANK Avg_SCHOOL_LEVEL_RANK Avg_TOTAL_RANK Avg_RANK
## 1              6.095563              6.877133       23.51536 821.7440
## 2              6.003484              6.655052       22.94077 873.0767
## 3              5.850427              6.615385       22.43590 896.3590
## 4              6.087137              6.784232       23.48963 828.1079
## 5              6.087137              6.784232       23.48963 828.1079
## 6              6.087137              6.784232       23.48963 828.1079
##   Most_Common_STUDENTS_GENDER Most_Common_SESSION Most_Common_FINANCE_TYPE
## 1                       CO-ED           WHOLE DAY                  PRIVATE
## 2                       CO-ED           WHOLE DAY                  PRIVATE
## 3                       CO-ED           WHOLE DAY                  PRIVATE
## 4                       CO-ED           WHOLE DAY                  PRIVATE
## 5                       CO-ED           WHOLE DAY                  PRIVATE
## 6                       CO-ED           WHOLE DAY                  PRIVATE
##              Most_Common_SCHOOL_LEVEL
## 1                        KINDERGARTEN
## 2                        KINDERGARTEN
## 3 KINDERGARTEN-CUM-CHILD CARE CENTRES
## 4                        KINDERGARTEN
## 5                        KINDERGARTEN
## 6                        KINDERGARTEN

# Check missing values again
colSums(is.na(merged_data))

##                      District              Total.population                          Male 
##                             0                             0                             0 
##                        Female                         X..15                      X15...24 
##                             0                             0                             0 
##                      X25...44                      X45...64                          X65. 
##                             0                             0                             0 
##                      Divorces              English.speaking     No.schooling..Pre.primary 
##                             0                             0                             0 
## Post.secondary..Degree.course Study.in.the.same.district.3.            Working.population 
##                             0                             0                             0 
##                     Employers                      REGION.x     Post_Secondary_Percentage 
##                             0                             0                             0 
##                    Rank_Score                       Cluster                      REGION.y 
##                             0                             0                             0 
##                 Total_Parking                          Year                 Housing_Price 
##                             0                           812                             0 
##                    REGION.x.x                          Date                 Control.Point 
##                             0                             6                             0 
##           Arrival...Departure           Hong.Kong.Residents             Mainland.Visitors 
##                             6                             0                             0 
##                Other.Visitors                         Total                    REGION.y.y 
##                             0                             0                             0 
##                 Avg_SCHOOL_NO  Most_Common_ENGLISH_CATEGORY              Avg_SESSION_RANK 
##                             0                             0                             0 
##     Avg_ENGLISH_CATEGORY_RANK         Avg_FINANCE_TYPE_RANK         Avg_SCHOOL_LEVEL_RANK 
##                             0                             0                             0 
##                Avg_TOTAL_RANK                      Avg_RANK   Most_Common_STUDENTS_GENDER 
##                             0                             0                             0 
##           Most_Common_SESSION      Most_Common_FINANCE_TYPE      Most_Common_SCHOOL_LEVEL 
##                             0                             0                             0

#############EXPLORATORY DATA ANALYSIS
#Data Overview
str(merged_data)

## 'data.frame':    360312 obs. of  45 variables:
##  $ District                     : Factor w/ 8 levels "KWAI TSING","NORTH",..: 4 8 3 7 7 7 7 7 7 7 ...
##  $ Total.population             : int  692806 668080 489037 506879 506879 506879 506879 506879 506879 506879 ...
##  $ Male                         : int  317927 314056 223094 240321 240321 240321 240321 240321 240321 240321 ...
##  $ Female                       : int  374879 354024 265943 266558 266558 266558 266558 266558 266558 266558 ...
##  $ X..15                        : int  78886 77199 56329 54136 54136 54136 54136 54136 54136 54136 ...
##  $ X15...24                     : int  53696 58988 39744 38535 38535 38535 38535 38535 38535 38535 ...
##  $ X25...44                     : int  201135 211376 150700 152609 152609 152609 152609 152609 152609 152609 ...
##  $ X45...64                     : int  220604 220151 164991 163644 163644 163644 163644 163644 163644 163644 ...
##  $ X65.                         : int  138485 100366 77273 97955 97955 97955 97955 97955 97955 97955 ...
##  $ Divorces                     : int  33651 36117 19976 29619 29619 29619 29619 29619 29619 29619 ...
##  $ English.speaking             : int  21042 16754 28502 9038 9038 9038 9038 9038 9038 9038 ...
##  $ No.schooling..Pre.primary    : int  23269 24183 14160 20637 20637 20637 20637 20637 20637 20637 ...
##  $ Post.secondary..Degree.course: int  154778 116916 128362 79104 79104 79104 79104 79104 79104 79104 ...
##  $ Study.in.the.same.district.3.: int  56374 36841 33449 49082 49082 49082 49082 49082 49082 49082 ...
##  $ Working.population           : int  340605 332587 262126 247419 247419 247419 247419 247419 247419 247419 ...
##  $ Employers                    : int  9333 8749 8206 5248 5248 5248 5248 5248 5248 5248 ...
##  $ REGION.x                     : chr  "NTE" "NTW" "NTE" "NTW" ...
##  $ Post_Secondary_Percentage    : num  22.3 17.5 26.2 15.6 15.6 ...
##  $ Rank_Score                   : num  204372 199559 157286 148458 148458 ...
##  $ Cluster                      : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ REGION.y                     : chr  "NTE" "NTW" "NTE" "NTW" ...
##  $ Total_Parking                : num  1587 957 1203 1171 1171 ...
##  $ Year                         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Housing_Price                : num  75031 75031 75031 75031 75031 ...
##  $ REGION.x.x                   : chr  "NTE" "NTE" "NTE" "NTE" ...
##  $ Date                         : chr  NA NA NA "44197" ...
##  $ Control.Point                : chr  "Lo Wu" "Lo Wu" "Lo Wu" "Tuen Mun Ferry Terminal" ...
##  $ Arrival...Departure          : chr  NA NA NA "Arrival" ...
##  $ Hong.Kong.Residents          : num  13709 13709 13709 0 0 ...
##  $ Mainland.Visitors            : num  3690 3690 3690 0 0 ...
##  $ Other.Visitors               : num  138 138 138 0 0 ...
##  $ Total                        : num  17537 17537 17537 0 0 ...
##  $ REGION.y.y                   : chr  "NTE" "NTE" "NTE" "NTW" ...
##  $ Avg_SCHOOL_NO                : num  3.50e+11 4.14e+11 4.65e+11 3.31e+11 3.31e+11 ...
##  $ Most_Common_ENGLISH_CATEGORY : chr  "Kindergartens" "Kindergartens" "Kindergarten-cum-child Care Centres" "Kindergartens" ...
##  $ Avg_SESSION_RANK             : num  8.08 8 7.79 8.27 8.27 ...
##  $ Avg_ENGLISH_CATEGORY_RANK    : num  2.47 2.28 2.18 2.35 2.35 ...
##  $ Avg_FINANCE_TYPE_RANK        : num  6.1 6 5.85 6.09 6.09 ...
##  $ Avg_SCHOOL_LEVEL_RANK        : num  6.88 6.66 6.62 6.78 6.78 ...
##  $ Avg_TOTAL_RANK               : num  23.5 22.9 22.4 23.5 23.5 ...
##  $ Avg_RANK                     : num  822 873 896 828 828 ...
##  $ Most_Common_STUDENTS_GENDER  : chr  "CO-ED" "CO-ED" "CO-ED" "CO-ED" ...
##  $ Most_Common_SESSION          : chr  "WHOLE DAY" "WHOLE DAY" "WHOLE DAY" "WHOLE DAY" ...
##  $ Most_Common_FINANCE_TYPE     : chr  "PRIVATE" "PRIVATE" "PRIVATE" "PRIVATE" ...
##  $ Most_Common_SCHOOL_LEVEL     : chr  "KINDERGARTEN" "KINDERGARTEN" "KINDERGARTEN-CUM-CHILD CARE CENTRES" "KINDERGARTEN" ...

dim(merged_data)

## [1] 360312     45

#chcek missing
colSums(is.na(merged_data))

##                      District              Total.population                          Male 
##                             0                             0                             0 
##                        Female                         X..15                      X15...24 
##                             0                             0                             0 
##                      X25...44                      X45...64                          X65. 
##                             0                             0                             0 
##                      Divorces              English.speaking     No.schooling..Pre.primary 
##                             0                             0                             0 
## Post.secondary..Degree.course Study.in.the.same.district.3.            Working.population 
##                             0                             0                             0 
##                     Employers                      REGION.x     Post_Secondary_Percentage 
##                             0                             0                             0 
##                    Rank_Score                       Cluster                      REGION.y 
##                             0                             0                             0 
##                 Total_Parking                          Year                 Housing_Price 
##                             0                           812                             0 
##                    REGION.x.x                          Date                 Control.Point 
##                             0                             6                             0 
##           Arrival...Departure           Hong.Kong.Residents             Mainland.Visitors 
##                             6                             0                             0 
##                Other.Visitors                         Total                    REGION.y.y 
##                             0                             0                             0 
##                 Avg_SCHOOL_NO  Most_Common_ENGLISH_CATEGORY              Avg_SESSION_RANK 
##                             0                             0                             0 
##     Avg_ENGLISH_CATEGORY_RANK         Avg_FINANCE_TYPE_RANK         Avg_SCHOOL_LEVEL_RANK 
##                             0                             0                             0 
##                Avg_TOTAL_RANK                      Avg_RANK   Most_Common_STUDENTS_GENDER 
##                             0                             0                             0 
##           Most_Common_SESSION      Most_Common_FINANCE_TYPE      Most_Common_SCHOOL_LEVEL 
##                             0                             0                             0

#Summary statistics
summary(merged_data)

##        District      Total.population      Male            Female           X..15      
##  NORTH     :359500   Min.   :309631   Min.   :145343   Min.   :164288   Min.   :33499  
##  TUEN MUN  :   806   1st Qu.:309631   1st Qu.:145343   1st Qu.:164288   1st Qu.:33499  
##  KWAI TSING:     1   Median :309631   Median :145343   Median :164288   Median :33499  
##  SAI KUNG  :     1   Mean   :310075   Mean   :145557   Mean   :164518   Mean   :33546  
##  SHA TIN   :     1   3rd Qu.:309631   3rd Qu.:145343   3rd Qu.:164288   3rd Qu.:33499  
##  TAI PO    :     1   Max.   :692806   Max.   :317927   Max.   :374879   Max.   :78886  
##  (Other)   :     2                                                                     
##     X15...24        X25...44         X45...64           X65.           Divorces    
##  Min.   :23198   Min.   : 88314   Min.   :102810   Min.   : 55557   Min.   :13173  
##  1st Qu.:27654   1st Qu.: 88314   1st Qu.:104607   1st Qu.: 55557   1st Qu.:17410  
##  Median :27654   Median : 88314   Median :104607   Median : 55557   Median :17410  
##  Mean   :27679   Mean   : 88459   Mean   :104740   Mean   : 55652   Mean   :17437  
##  3rd Qu.:27654   3rd Qu.: 88314   3rd Qu.:104607   3rd Qu.: 55557   3rd Qu.:17410  
##  Max.   :58988   Max.   :211376   Max.   :220604   Max.   :138485   Max.   :36117  
##                                                                                    
##  English.speaking No.schooling..Pre.primary Post.secondary..Degree.course
##  Min.   : 4330    Min.   :10535             Min.   : 48482               
##  1st Qu.: 4330    1st Qu.:11971             1st Qu.: 48482               
##  Median : 4330    Median :11971             Median : 48482               
##  Mean   : 4341    Mean   :11990             Mean   : 48551               
##  3rd Qu.: 4330    3rd Qu.:11971             3rd Qu.: 48482               
##  Max.   :28502    Max.   :27431             Max.   :154778               
##                                                                          
##  Study.in.the.same.district.3. Working.population   Employers      REGION.x        
##  Min.   :22365                 Min.   :146416     Min.   :2622   Length:360312     
##  1st Qu.:23134                 1st Qu.:146416     1st Qu.:2622   Class :character  
##  Median :23134                 Median :146416     Median :2622   Mode  :character  
##  Mean   :23192                 Mean   :146644     Mean   :2628                     
##  3rd Qu.:23134                 3rd Qu.:146416     3rd Qu.:2622                     
##  Max.   :56374                 Max.   :340605     Max.   :9333                     
##                                                                                    
##  Post_Secondary_Percentage   Rank_Score        Cluster        REGION.y        
##  Min.   :14.78             Min.   : 87856   Min.   :1.000   Length:360312     
##  1st Qu.:15.66             1st Qu.: 87856   1st Qu.:2.000   Class :character  
##  Median :15.66             Median : 87856   Median :2.000   Mode  :character  
##  Mean   :15.66             Mean   : 87992   Mean   :2.002                     
##  3rd Qu.:15.66             3rd Qu.: 87856   3rd Qu.:2.000                     
##  Max.   :26.25             Max.   :204372   Max.   :3.000                     
##                                                                               
##  Total_Parking       Year      Housing_Price     REGION.x.x            Date          
##  Min.   : 506   Min.   :1999   Min.   : 20843   Length:360312      Length:360312     
##  1st Qu.:1097   1st Qu.:2005   1st Qu.: 31444   Class :character   Class :character  
##  Median :1097   Median :2011   Median : 58093   Mode  :character   Mode  :character  
##  Mean   :1097   Mean   :2011   Mean   : 75031                                        
##  3rd Qu.:1097   3rd Qu.:2017   3rd Qu.:121633                                        
##  Max.   :1587   Max.   :2023   Max.   :152765                                        
##                 NA's   :812                                                          
##  Control.Point      Arrival...Departure Hong.Kong.Residents Mainland.Visitors
##  Length:360312      Length:360312       Min.   :     0      Min.   :    0    
##  Class :character   Class :character    1st Qu.:     0      1st Qu.:    0    
##  Mode  :character   Mode  :character    Median :     0      Median :    0    
##                                         Mean   : 13709      Mean   : 3690    
##                                         3rd Qu.: 19174      3rd Qu.: 4452    
##                                         Max.   :155071      Max.   :53608    
##                                                                              
##  Other.Visitors       Total         REGION.y.y        Avg_SCHOOL_NO      
##  Min.   :   0.0   Min.   :     0   Length:360312      Min.   :3.305e+11  
##  1st Qu.:   0.0   1st Qu.:     0   Class :character   1st Qu.:3.575e+11  
##  Median :   0.0   Median :     0   Mode  :character   Median :3.575e+11  
##  Mean   : 137.7   Mean   : 17537                      Mean   :3.574e+11  
##  3rd Qu.: 259.0   3rd Qu.: 24286                      3rd Qu.:3.575e+11  
##  Max.   :1263.0   Max.   :168435                      Max.   :4.648e+11  
##                                                                          
##  Most_Common_ENGLISH_CATEGORY Avg_SESSION_RANK Avg_ENGLISH_CATEGORY_RANK
##  Length:360312                Min.   :7.791    Min.   :2.176            
##  Class :character             1st Qu.:8.113    1st Qu.:2.310            
##  Mode  :character             Median :8.113    Median :2.310            
##                               Mean   :8.113    Mean   :2.310            
##                               3rd Qu.:8.113    3rd Qu.:2.310            
##                               Max.   :8.317    Max.   :2.655            
##                                                                         
##  Avg_FINANCE_TYPE_RANK Avg_SCHOOL_LEVEL_RANK Avg_TOTAL_RANK     Avg_RANK    
##  Min.   :5.850         Min.   :6.571         Min.   :22.44   Min.   :785.0  
##  1st Qu.:6.006         1st Qu.:6.571         1st Qu.:23.00   1st Qu.:866.4  
##  Median :6.006         Median :6.571         Median :23.00   Median :866.4  
##  Mean   :6.006         Mean   :6.572         Mean   :23.00   Mean   :866.3  
##  3rd Qu.:6.006         3rd Qu.:6.571         3rd Qu.:23.00   3rd Qu.:866.4  
##  Max.   :6.096         Max.   :6.952         Max.   :23.96   Max.   :896.4  
##                                                                             
##  Most_Common_STUDENTS_GENDER Most_Common_SESSION Most_Common_FINANCE_TYPE
##  Length:360312               Length:360312       Length:360312           
##  Class :character            Class :character    Class :character        
##  Mode  :character            Mode  :character    Mode  :character        
##                                                                          
##                                                                          
##                                                                          
##                                                                          
##  Most_Common_SCHOOL_LEVEL
##  Length:360312           
##  Class :character        
##  Mode  :character        
##                          
##                          
##                          
##

#Univariate analysis
table(merged_data$REGION.y,merged_data$REGION.x)

##      
##          NTE    NTW
##   NTE 359503      0
##   NTW      0    809

barplot(table(merged_data$REGION.y,merged_data$REGION.x))

hist(merged_data$`Total.population`, main = "Total Population Distribution", xlab = "Total Population")

boxplot(merged_data$`Housing_Price`, main = "Housing Price Distribution", ylab = "Price")

#Outlier detection
boxplot(merged_data$Housing_Price, main = "Boxplot for Housing Prices")

#Bivariate Analysis
#Numerical vs numerical
plot(merged_data$`Housing_Price`, merged_data$`Total.population`, 
     main = "Housing Price vs Total Population", 
     xlab = "Housing Price", ylab = "Total Population")

cor(merged_data$Housing_Price, merged_data$`Total.population`, use = "complete.obs")

## [1] 6.673583e-15

#correlation between housing price and totalpopulation=0.4719672

#Categorical vs. Numerical:

#Boxplot of housing prices for each region
boxplot(Housing_Price ~ REGION.x, data = merged_data, 
        main = "Housing Prices by Region", 
        xlab = "Region", ylab = "Housing Price")

# columns that are "constant" (have only one unique value excluding NA)
constant_cols <- sapply(merged_data, function(col) length(unique(na.omit(col))) == 1)

# Drop these constant columns
merged_data <- merged_data[, !constant_cols]

#Multivariate Analysis
#Heatmap of correlations between numerical variables:
library(ggcorrplot)
num_data <- merged_data %>% select_if(is.numeric)
constant_cols <- sapply(num_data, function(x) sd(x, na.rm = TRUE) == 0)
constant_cols

##              Total.population                          Male                        Female 
##                         FALSE                         FALSE                         FALSE 
##                         X..15                      X15...24                      X25...44 
##                         FALSE                         FALSE                         FALSE 
##                      X45...64                          X65.                      Divorces 
##                         FALSE                         FALSE                         FALSE 
##              English.speaking     No.schooling..Pre.primary Post.secondary..Degree.course 
##                         FALSE                         FALSE                         FALSE 
## Study.in.the.same.district.3.            Working.population                     Employers 
##                         FALSE                         FALSE                         FALSE 
##     Post_Secondary_Percentage                    Rank_Score                       Cluster 
##                         FALSE                         FALSE                         FALSE 
##                 Total_Parking                          Year                 Housing_Price 
##                         FALSE                         FALSE                         FALSE 
##           Hong.Kong.Residents             Mainland.Visitors                Other.Visitors 
##                         FALSE                         FALSE                         FALSE 
##                         Total                 Avg_SCHOOL_NO              Avg_SESSION_RANK 
##                         FALSE                         FALSE                         FALSE 
##     Avg_ENGLISH_CATEGORY_RANK         Avg_FINANCE_TYPE_RANK         Avg_SCHOOL_LEVEL_RANK 
##                         FALSE                         FALSE                         FALSE 
##                Avg_TOTAL_RANK                      Avg_RANK 
##                         FALSE                         FALSE

num_data <- num_data[, !constant_cols]
corr_matrix <- cor(num_data, use = "complete.obs")

## Warning in cor(num_data, use = "complete.obs"): the standard deviation is zero

ggcorrplot(corr_matrix, lab = TRUE)

#Principal Component Analysis (PCA) for dimensionality reduction and pattern discovery:
library(FactoMineR)
pca_results <- PCA(merged_data %>% select_if(is.numeric), graph = TRUE)

## Warning in PCA(merged_data %>% select_if(is.numeric), graph = TRUE): Missing values are
## imputed by the mean of the variable: you should use the imputePCA function of the missMDA
## package

## Warning: ggrepel: 24 unlabeled data points (too many overlaps). Consider increasing
## max.overlaps

colnames(merged_data)

##  [1] "District"                      "Total.population"             
##  [3] "Male"                          "Female"                       
##  [5] "X..15"                         "X15...24"                     
##  [7] "X25...44"                      "X45...64"                     
##  [9] "X65."                          "Divorces"                     
## [11] "English.speaking"              "No.schooling..Pre.primary"    
## [13] "Post.secondary..Degree.course" "Study.in.the.same.district.3."
## [15] "Working.population"            "Employers"                    
## [17] "REGION.x"                      "Post_Secondary_Percentage"    
## [19] "Rank_Score"                    "Cluster"                      
## [21] "REGION.y"                      "Total_Parking"                
## [23] "Year"                          "Housing_Price"                
## [25] "Date"                          "Control.Point"                
## [27] "Arrival...Departure"           "Hong.Kong.Residents"          
## [29] "Mainland.Visitors"             "Other.Visitors"               
## [31] "Total"                         "REGION.y.y"                   
## [33] "Avg_SCHOOL_NO"                 "Most_Common_ENGLISH_CATEGORY" 
## [35] "Avg_SESSION_RANK"              "Avg_ENGLISH_CATEGORY_RANK"    
## [37] "Avg_FINANCE_TYPE_RANK"         "Avg_SCHOOL_LEVEL_RANK"        
## [39] "Avg_TOTAL_RANK"                "Avg_RANK"                     
## [41] "Most_Common_SCHOOL_LEVEL"

#Key Insights and Visualization

ggplot(merged_data, aes(x = Housing_Price)) +
  geom_histogram(binwidth = 5000, fill = "steelblue", color = "white") +
  labs(title = "Distribution of Housing Prices", x = "Housing_Price", y = "Frequency")

#Interactive plot#s
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

#plot_ly(merged_data, x = ~Housing_Price, y = ~Total.population, type = 'scatter', mode = 'markers')

#Correlation analysis
# Calculate Pearson's correlation for relevant variables

# Calculate Pearson's correlation for relevant numeric variables
correlation_matrix <- cor(merged_data[c("Housing_Price", "Total.population", "Total", 
                                        "Total_Parking", "Post.secondary..Degree.course", 
                                        "Rank_Score", "Working.population", "Divorces", 
                                        "Hong.Kong.Residents", "Mainland.Visitors", 
                                        "Other.Visitors", "Avg_SCHOOL_NO", 
                                        "Avg_SESSION_RANK", "Avg_ENGLISH_CATEGORY_RANK", 
                                        "Avg_FINANCE_TYPE_RANK", "Avg_SCHOOL_LEVEL_RANK", 
                                        "Avg_TOTAL_RANK", "Avg_RANK")], 
                          use = "complete.obs")

# View the correlation matrix
print(correlation_matrix)

##                               Housing_Price Total.population         Total Total_Parking
## Housing_Price                  1.000000e+00     6.673583e-15  2.717929e-15 -3.735370e-14
## Total.population               6.673583e-15     1.000000e+00 -2.793183e-02  8.962105e-01
## Total                          2.717929e-15    -2.793183e-02  1.000000e+00 -2.521344e-02
## Total_Parking                 -3.735370e-14     8.962105e-01 -2.521344e-02  1.000000e+00
## Post.secondary..Degree.course -4.406838e-14     9.961201e-01 -2.764660e-02  8.964682e-01
## Rank_Score                     1.354517e-14     9.999503e-01 -2.792485e-02  8.961605e-01
## Working.population            -5.169219e-15     9.999503e-01 -2.792486e-02  8.961606e-01
## Divorces                      -7.159718e-14     9.992776e-01 -2.799620e-02  8.964774e-01
## Hong.Kong.Residents            1.251160e-14    -2.755736e-02  9.944558e-01 -2.487541e-02
## Mainland.Visitors             -3.351274e-15    -2.649244e-02  9.266113e-01 -2.391414e-02
## Other.Visitors                -9.362999e-15    -2.875393e-02  8.482762e-01 -2.595553e-02
## Avg_SCHOOL_NO                 -5.964421e-15    -9.736710e-01  2.771496e-02 -8.902775e-01
## Avg_SESSION_RANK               4.761092e-14     9.845755e-01 -2.792219e-02  9.077867e-01
## Avg_ENGLISH_CATEGORY_RANK      3.059107e-14     9.270314e-01 -2.615259e-02  9.082421e-01
## Avg_FINANCE_TYPE_RANK         -2.455677e-14     9.916551e-01 -2.798013e-02  9.066681e-01
## Avg_SCHOOL_LEVEL_RANK          3.031967e-14     9.958140e-01 -2.798463e-02  9.075818e-01
## Avg_TOTAL_RANK                 5.649427e-15     9.915495e-01 -2.797322e-02  9.129204e-01
## Avg_RANK                       2.182079e-14    -9.917905e-01  2.796856e-02 -9.159251e-01
##                               Post.secondary..Degree.course    Rank_Score
## Housing_Price                                 -4.406838e-14  1.354517e-14
## Total.population                               9.961201e-01  9.999503e-01
## Total                                         -2.764660e-02 -2.792485e-02
## Total_Parking                                  8.964682e-01  8.961605e-01
## Post.secondary..Degree.course                  1.000000e+00  9.967163e-01
## Rank_Score                                     9.967163e-01  1.000000e+00
## Working.population                             9.967160e-01  1.000000e+00
## Divorces                                       9.921101e-01  9.989595e-01
## Hong.Kong.Residents                           -2.727595e-02 -2.755048e-02
## Mainland.Visitors                             -2.622191e-02 -2.648582e-02
## Other.Visitors                                -2.846030e-02 -2.874675e-02
## Avg_SCHOOL_NO                                 -9.549889e-01 -9.721736e-01
## Avg_SESSION_RANK                               9.686359e-01  9.835895e-01
## Avg_ENGLISH_CATEGORY_RANK                      9.263968e-01  9.271543e-01
## Avg_FINANCE_TYPE_RANK                          9.788458e-01  9.906988e-01
## Avg_SCHOOL_LEVEL_RANK                          9.907283e-01  9.958380e-01
## Avg_TOTAL_RANK                                 9.820547e-01  9.910959e-01
## Avg_RANK                                      -9.834338e-01 -9.914752e-01
##                               Working.population      Divorces Hong.Kong.Residents
## Housing_Price                      -5.169219e-15 -7.159718e-14        1.251160e-14
## Total.population                    9.999503e-01  9.992776e-01       -2.755736e-02
## Total                              -2.792486e-02 -2.799620e-02        9.944558e-01
## Total_Parking                       8.961606e-01  8.964774e-01       -2.487541e-02
## Post.secondary..Degree.course       9.967160e-01  9.921101e-01       -2.727595e-02
## Rank_Score                          1.000000e+00  9.989595e-01       -2.755048e-02
## Working.population                  1.000000e+00  9.989597e-01       -2.755048e-02
## Divorces                            9.989597e-01  1.000000e+00       -2.762087e-02
## Hong.Kong.Residents                -2.755048e-02 -2.762087e-02        1.000000e+00
## Mainland.Visitors                  -2.648583e-02 -2.655350e-02        8.819800e-01
## Other.Visitors                     -2.874675e-02 -2.882020e-02        8.434681e-01
## Avg_SCHOOL_NO                      -9.721742e-01 -9.797933e-01        2.734340e-02
## Avg_SESSION_RANK                    9.835900e-01  9.894538e-01       -2.754785e-02
## Avg_ENGLISH_CATEGORY_RANK           9.271541e-01  9.260159e-01       -2.580198e-02
## Avg_FINANCE_TYPE_RANK               9.906992e-01  9.950895e-01       -2.760501e-02
## Avg_SCHOOL_LEVEL_RANK               9.958380e-01  9.958970e-01       -2.760946e-02
## Avg_TOTAL_RANK                      9.910961e-01  9.936316e-01       -2.759820e-02
## Avg_RANK                           -9.914754e-01 -9.934287e-01        2.759360e-02
##                               Mainland.Visitors Other.Visitors Avg_SCHOOL_NO
## Housing_Price                     -3.351274e-15  -9.362999e-15 -5.964421e-15
## Total.population                  -2.649244e-02  -2.875393e-02 -9.736710e-01
## Total                              9.266113e-01   8.482762e-01  2.771496e-02
## Total_Parking                     -2.391414e-02  -2.595553e-02 -8.902775e-01
## Post.secondary..Degree.course     -2.622191e-02  -2.846030e-02 -9.549889e-01
## Rank_Score                        -2.648582e-02  -2.874675e-02 -9.721736e-01
## Working.population                -2.648583e-02  -2.874675e-02 -9.721742e-01
## Divorces                          -2.655350e-02  -2.882020e-02 -9.797933e-01
## Hong.Kong.Residents                8.819800e-01   8.434681e-01  2.734340e-02
## Mainland.Visitors                  1.000000e+00   7.767572e-01  2.628675e-02
## Other.Visitors                     7.767572e-01   1.000000e+00  2.853068e-02
## Avg_SCHOOL_NO                      2.628675e-02   2.853068e-02  1.000000e+00
## Avg_SESSION_RANK                  -2.648330e-02  -2.874401e-02 -9.955158e-01
## Avg_ENGLISH_CATEGORY_RANK         -2.480490e-02  -2.692233e-02 -9.423869e-01
## Avg_FINANCE_TYPE_RANK             -2.653825e-02  -2.880365e-02 -9.941441e-01
## Avg_SCHOOL_LEVEL_RANK             -2.654253e-02  -2.880829e-02 -9.834260e-01
## Avg_TOTAL_RANK                    -2.653170e-02  -2.879654e-02 -9.913951e-01
## Avg_RANK                           2.652728e-02   2.879174e-02  9.896727e-01
##                               Avg_SESSION_RANK Avg_ENGLISH_CATEGORY_RANK
## Housing_Price                     4.761092e-14              3.059107e-14
## Total.population                  9.845755e-01              9.270314e-01
## Total                            -2.792219e-02             -2.615259e-02
## Total_Parking                     9.077867e-01              9.082421e-01
## Post.secondary..Degree.course     9.686359e-01              9.263968e-01
## Rank_Score                        9.835895e-01              9.271543e-01
## Working.population                9.835900e-01              9.271541e-01
## Divorces                          9.894538e-01              9.260159e-01
## Hong.Kong.Residents              -2.754785e-02             -2.580198e-02
## Mainland.Visitors                -2.648330e-02             -2.480490e-02
## Other.Visitors                   -2.874401e-02             -2.692233e-02
## Avg_SCHOOL_NO                    -9.955158e-01             -9.423869e-01
## Avg_SESSION_RANK                  1.000000e+00              9.413553e-01
## Avg_ENGLISH_CATEGORY_RANK         9.413553e-01              1.000000e+00
## Avg_FINANCE_TYPE_RANK             9.976824e-01              9.402989e-01
## Avg_SCHOOL_LEVEL_RANK             9.912731e-01              9.527621e-01
## Avg_TOTAL_RANK                    9.967397e-01              9.567032e-01
## Avg_RANK                         -9.957855e-01             -9.581160e-01
##                               Avg_FINANCE_TYPE_RANK Avg_SCHOOL_LEVEL_RANK Avg_TOTAL_RANK
## Housing_Price                         -2.455677e-14          3.031967e-14   5.649427e-15
## Total.population                       9.916551e-01          9.958140e-01   9.915495e-01
## Total                                 -2.798013e-02         -2.798463e-02  -2.797322e-02
## Total_Parking                          9.066681e-01          9.075818e-01   9.129204e-01
## Post.secondary..Degree.course          9.788458e-01          9.907283e-01   9.820547e-01
## Rank_Score                             9.906988e-01          9.958380e-01   9.910959e-01
## Working.population                     9.906992e-01          9.958380e-01   9.910961e-01
## Divorces                               9.950895e-01          9.958970e-01   9.936316e-01
## Hong.Kong.Residents                   -2.760501e-02         -2.760946e-02  -2.759820e-02
## Mainland.Visitors                     -2.653825e-02         -2.654253e-02  -2.653170e-02
## Other.Visitors                        -2.880365e-02         -2.880829e-02  -2.879654e-02
## Avg_SCHOOL_NO                         -9.941441e-01         -9.834260e-01  -9.913951e-01
## Avg_SESSION_RANK                       9.976824e-01          9.912731e-01   9.967397e-01
## Avg_ENGLISH_CATEGORY_RANK              9.402989e-01          9.527621e-01   9.567032e-01
## Avg_FINANCE_TYPE_RANK                  1.000000e+00          9.944607e-01   9.976760e-01
## Avg_SCHOOL_LEVEL_RANK                  9.944607e-01          1.000000e+00   9.981730e-01
## Avg_TOTAL_RANK                         9.976760e-01          9.981730e-01   1.000000e+00
## Avg_RANK                              -9.967574e-01         -9.986619e-01  -9.998748e-01
##                                    Avg_RANK
## Housing_Price                  2.182079e-14
## Total.population              -9.917905e-01
## Total                          2.796856e-02
## Total_Parking                 -9.159251e-01
## Post.secondary..Degree.course -9.834338e-01
## Rank_Score                    -9.914752e-01
## Working.population            -9.914754e-01
## Divorces                      -9.934287e-01
## Hong.Kong.Residents            2.759360e-02
## Mainland.Visitors              2.652728e-02
## Other.Visitors                 2.879174e-02
## Avg_SCHOOL_NO                  9.896727e-01
## Avg_SESSION_RANK              -9.957855e-01
## Avg_ENGLISH_CATEGORY_RANK     -9.581160e-01
## Avg_FINANCE_TYPE_RANK         -9.967574e-01
## Avg_SCHOOL_LEVEL_RANK         -9.986619e-01
## Avg_TOTAL_RANK                -9.998748e-01
## Avg_RANK                       1.000000e+00

#Visualize the correlation matrix

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

library(ggplot2)

# Reshape the correlation matrix for ggplot
correlation_melt <- melt(correlation_matrix)
ggplot(correlation_melt, aes(Var1, Var2, fill = value)) +
  geom_tile() + 
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0) + 
  theme_minimal() + 
  labs(title = "Correlation Heatmap", x = "", y = "")

#Statistical Tests
#t-test
#comparing house prices between New Territories East (NTE) and New Territories West (NTW).
# Perform t-test for house prices between NTE and NTW
colnames(Housing_long)

## [1] "Year"          "District"      "Housing_Price" "REGION"

t_test_result <- t.test(Housing_Price ~REGION.x, data = merged_data)
print(t_test_result)

## 
##  Welch Two Sample t-test
## 
## data:  Housing_Price by REGION.x
## t = -2.4357e-09, df = 359502, p-value = 1
## alternative hypothesis: true difference in means between group NTE and group NTW is not equal to 0
## 95 percent confidence interval:
##  -154.2993  154.2993
## sample estimates:
## mean in group NTE mean in group NTW 
##           75030.6           75030.6

#If the p-value is less than 0.05, it suggests that the means between the two regions are significantly different

#ANOVA
# ANOVA to compare house prices across different districts
anova_result <- aov(Housing_Price ~ District, data = merged_data)
summary(anova_result)

##                 Df   Sum Sq   Mean Sq F value Pr(>F)
## District         7 0.00e+00 0.000e+00       0      1
## Residuals   360304 8.01e+14 2.223e+09

# ANOVA table
anova_table <- anova(anova_result)
print(anova_table)

## Analysis of Variance Table
## 
## Response: Housing_Price
##               Df   Sum Sq    Mean Sq F value Pr(>F)
## District       7 0.00e+00          0       0      1
## Residuals 360304 8.01e+14 2223122122

colnames(merged_data)

##  [1] "District"                      "Total.population"             
##  [3] "Male"                          "Female"                       
##  [5] "X..15"                         "X15...24"                     
##  [7] "X25...44"                      "X45...64"                     
##  [9] "X65."                          "Divorces"                     
## [11] "English.speaking"              "No.schooling..Pre.primary"    
## [13] "Post.secondary..Degree.course" "Study.in.the.same.district.3."
## [15] "Working.population"            "Employers"                    
## [17] "REGION.x"                      "Post_Secondary_Percentage"    
## [19] "Rank_Score"                    "Cluster"                      
## [21] "REGION.y"                      "Total_Parking"                
## [23] "Year"                          "Housing_Price"                
## [25] "Date"                          "Control.Point"                
## [27] "Arrival...Departure"           "Hong.Kong.Residents"          
## [29] "Mainland.Visitors"             "Other.Visitors"               
## [31] "Total"                         "REGION.y.y"                   
## [33] "Avg_SCHOOL_NO"                 "Most_Common_ENGLISH_CATEGORY" 
## [35] "Avg_SESSION_RANK"              "Avg_ENGLISH_CATEGORY_RANK"    
## [37] "Avg_FINANCE_TYPE_RANK"         "Avg_SCHOOL_LEVEL_RANK"        
## [39] "Avg_TOTAL_RANK"                "Avg_RANK"                     
## [41] "Most_Common_SCHOOL_LEVEL"

##Linear regression
# Fit a linear regression model
# Fit a linear regression model 
model <- lm(Housing_Price ~ Hong.Kong.Residents + Mainland.Visitors + Avg_SCHOOL_LEVEL_RANK, data = merged_data)
summary(model)

## 
## Call:
## lm(formula = Housing_Price ~ Hong.Kong.Residents + Mainland.Visitors + 
##     Avg_SCHOOL_LEVEL_RANK, data = merged_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -54188 -43587 -16938  46602  77734 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)            7.503e+04  5.119e+04   1.466    0.143
## Hong.Kong.Residents    1.080e-14  7.114e-03   0.000    1.000
## Mainland.Visitors     -6.512e-15  2.541e-02   0.000    1.000
## Avg_SCHOOL_LEVEL_RANK  1.658e-07  7.788e+03   0.000    1.000
## 
## Residual standard error: 47150 on 360308 degrees of freedom
## Multiple R-squared:  5.856e-24,  Adjusted R-squared:  -8.326e-06 
## F-statistic: 7.033e-19 on 3 and 360308 DF,  p-value: 1

# Diagnostic plots for the linear regression model
par(mfrow = c(2, 2))  # Set up a 2x2 plot grid
plot(model)

#Chi-Square Test for Categorical Data
# Perform Chi-square test for Cluster vs REGION
chi_test <- chisq.test(table(merged_data$Cluster, merged_data$REGION.x))

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be incorrect

print(chi_test)

## 
##  Pearson's Chi-squared test
## 
## data:  table(merged_data$Cluster, merged_data$REGION.x)
## X-squared = 359198, df = 2, p-value < 2.2e-16

#Normalization to 3NF:
colnames(merged_data)

##  [1] "District"                      "Total.population"             
##  [3] "Male"                          "Female"                       
##  [5] "X..15"                         "X15...24"                     
##  [7] "X25...44"                      "X45...64"                     
##  [9] "X65."                          "Divorces"                     
## [11] "English.speaking"              "No.schooling..Pre.primary"    
## [13] "Post.secondary..Degree.course" "Study.in.the.same.district.3."
## [15] "Working.population"            "Employers"                    
## [17] "REGION.x"                      "Post_Secondary_Percentage"    
## [19] "Rank_Score"                    "Cluster"                      
## [21] "REGION.y"                      "Total_Parking"                
## [23] "Year"                          "Housing_Price"                
## [25] "Date"                          "Control.Point"                
## [27] "Arrival...Departure"           "Hong.Kong.Residents"          
## [29] "Mainland.Visitors"             "Other.Visitors"               
## [31] "Total"                         "REGION.y.y"                   
## [33] "Avg_SCHOOL_NO"                 "Most_Common_ENGLISH_CATEGORY" 
## [35] "Avg_SESSION_RANK"              "Avg_ENGLISH_CATEGORY_RANK"    
## [37] "Avg_FINANCE_TYPE_RANK"         "Avg_SCHOOL_LEVEL_RANK"        
## [39] "Avg_TOTAL_RANK"                "Avg_RANK"                     
## [41] "Most_Common_SCHOOL_LEVEL"

library(dplyr)
#Structuring the Data into Separate Tables (for 2NF):
# Demographic Data Table (District as Primary Key)
district_data <- merged_data %>%
  select(District, Total.population, Male, Female, Divorces)

# Housing Data Table (District as Foreign Key)
housing_data <- merged_data %>%
  select(District, Housing_Price, Total_Parking)

# School Data Table (District as Foreign Key)
school_data <- merged_data %>%
  select(District, Avg_SCHOOL_LEVEL_RANK, Avg_TOTAL_RANK)

# Traffic Data Table (District as Foreign Key)
traffic_data <- merged_data %>%
  select(District, Hong.Kong.Residents, Mainland.Visitors)

###Create the Scoring System:
# Min-Max scaling function to scale values between 0 and 10
normalize_score <- function(x) {
  return (10 * (x - min(x)) / (max(x) - min(x)))
}

# Score each characteristic
merged_data$Housing_Price_Score <- normalize_score(merged_data$Housing_Price)
merged_data$Avg_School_Rank_Score <- 10 - normalize_score(merged_data$Avg_SCHOOL_LEVEL_RANK)  # Higher rank gets higher score
merged_data$Total_Parking_Score <- normalize_score(merged_data$Total_Parking)
merged_data$Post_Secondary_Percentage_Score <- normalize_score(merged_data$Post_Secondary_Percentage)
merged_data$Divorces_Score <- normalize_score(merged_data$Divorces)
merged_data$Total_Residents_Score <- normalize_score(merged_data$Hong.Kong.Residents)
merged_data$Total_Visitors_Score <- normalize_score(merged_data$Mainland.Visitors)

# Combine scores for each district
merged_data$Total_Score <- rowSums(merged_data[c("Housing_Price_Score", "Avg_School_Rank_Score", "Total_Parking_Score", 
                                                 "Post_Secondary_Percentage_Score", "Divorces_Score", "Total_Residents_Score", 
                                                 "Total_Visitors_Score")])

# Normalize the total score (if necessary)
merged_data$Total_Score <- normalize_score(merged_data$Total_Score)
# Sort districts by Total_Score and display the top 3
top_districts <- merged_data %>%
  arrange(desc(Total_Score)) %>%
  head(3)

# Display the top 3 districts along with their REGION (NTE or NTW)
top_districts <- top_districts[, c("District", "Total_Score", "REGION.x")]

# Visualization of the top 3 districts, including region
ggplot(top_districts, aes(x = reorder(District, Total_Score), y = Total_Score, fill = REGION.x)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("NTE" = "lightgreen", "NTW" = "lightblue")) + # Custom colors for NTE and NTW
  theme_minimal() +
  labs(title = "Top 3 Districts by Total Score", x = "District", y = "Total Score", fill = "Region") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the result
print(top_districts)

##   District Total_Score REGION.x
## 1    NORTH   10.000000      NTE
## 2    NORTH    9.932890      NTE
## 3    NORTH    9.906578      NTE