title: “Recommendation sys” output: pdf_document —
#Libraries
library(sf)
library(ggplot2)
library(tmap)
library(dplyr)
library(dplyr)
library(janitor)
library(lubridate)
library(tidyr)
library(htmltools)
#NEW TERRITORIES MAP
#hongkong shapefile
hk_districts <- st_read("/Users/apple/Downloads/Hong_Kong_18_Districts_3574690825663895789")
## Reading layer `HKDistrict18' from data source
## `/Users/apple/Downloads/Hong_Kong_18_Districts_3574690825663895789' using driver `ESRI Shapefile'
## Simple feature collection with 18 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 12672060 ymin: 2529946 xmax: 12739620 ymax: 2579129
## Projected CRS: WGS 84 / Pseudo-Mercator
#map of all hongkong districts
ggplot(data = hk_districts) +
geom_sf(aes(fill =ENAME)) +
labs(title = "Map of all Districts in Hong Kong",
fill = "District") +
theme_minimal()
# Districts in NTE and NTW regions
nte_districts <- c("SHA TIN", "TAI PO", "NORTH", "SAI KUNG")
ntw_districts <- c("KWAI TSING", "TSUEN WAN", "TUEN MUN", "YUEN LONG")
# Subsetting the districts in NTE and NTW
nte_districts_data <- hk_districts[hk_districts$ENAME %in% nte_districts, ]
ntw_districts_data <- hk_districts[hk_districts$ENAME %in% ntw_districts, ]
# Assigning colors to the NTE and NTW districts
nte_colors <- c("#4CAF50", "#2196F3", "#00BCD4", "#FF9800")
ntw_colors <- c("#673AB7", "#E91E63", "#FFC107", "#FFC107")
nte_districts_data$color <- nte_colors
ntw_districts_data$color <- ntw_colors
# New territory map
ggplot() +
geom_sf(data = nte_districts_data, aes(fill = color)) +
geom_sf(data = ntw_districts_data, aes(fill = color)) +
geom_sf_text(data = nte_districts_data, aes(label = ENAME), size = 3, color = "black", fontface = "bold") +
geom_sf_text(data = ntw_districts_data, aes(label = ENAME), size = 3, color = "black", fontface = "bold") +
scale_fill_identity() +
labs(title = "Map of NTE and NTW Districts in Hong Kong") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5),
axis.text = element_blank(),
axis.ticks = element_blank()
)
#DATA CLEANING
#Check duplicates
#Duplicate Rows in Each Dataset
#install.packages("openxlsx")
library(openxlsx)
Traffic_data<- read.xlsx("/Users/apple/Downloads/Traffic.xlsx")
Housing_prices <- read.csv("/Users/apple/Downloads/Housing prices.csv")
School_data <- read.xlsx("/Users/apple/Downloads/School data.xlsx")
Census_data <- read.csv("/Users/apple/Downloads/Census and Social information.csv")
Housing_parking <- read.csv("/Users/apple/Downloads/Housing parking.csv")
sum(duplicated(Housing_prices)) # Count of duplicate rows in Housing_prices
## [1] 0
sum(duplicated(School_data)) # Count of duplicate rows in School_data
## [1] 0
sum(duplicated(Census_data)) # Count of duplicate rows in Census_data
## [1] 0
sum(duplicated(Traffic_data)) # Count of duplicate rows in Traffic_data
## [1] 0
#There are no duplicates```{r}
#MISSING VALUES
# Check for missing values and count them in each dataset
sum(is.na(Housing_prices))
## [1] 0
sum(is.na(School_data))
## [1] 0
sum(is.na(Census_data))
## [1] 0
sum(is.na(Traffic_data))
## [1] 0
#There are no missing values
#Normalizing categorical variables
#SCHOOL DATA
#Assign SESSION_RANK Based on Session Type
School_data <- School_data %>%
mutate(SESSION_RANK = case_when(
SESSION == "WHOLE DAY" ~ 10,
SESSION == "P.M." ~ 7,
SESSION == "A.M." ~ 4,
SESSION == "EVENING" ~ 2,
TRUE ~ 0 # In case there are any unexpected values
))
# View the updated SESSION_RANK column
table(School_data$SESSION_RANK)
##
## 2 4 7 10
## 14 779 731 2037
#English category rank of schools
#Assign RANK Based on English Category
School_data <- School_data %>%
mutate(ENGLISH_CATEGORY_RANK = case_when(
`ENGLISH.CATEGORY` %in% c("International Schools (Primary)", "International Schools (Secondary)") ~ 10,
`ENGLISH.CATEGORY` %in% c("English Schools Foundation (Primary)", "English Schools Foundation (Secondary)") ~ 9,
`ENGLISH.CATEGORY` == "Government Secondary Schools" ~ 8,
`ENGLISH.CATEGORY` == "Aided Secondary Schools" ~ 7,
`ENGLISH.CATEGORY` == "Direct Subsidy Scheme Secondary Schools" ~ 6,
`ENGLISH.CATEGORY` == "Private Secondary Schools (Day/Evening)" ~ 5,
`ENGLISH.CATEGORY` == "Aided Primary Schools" ~ 4,
`ENGLISH.CATEGORY` == "Government Primary Schools" ~ 3,
`ENGLISH.CATEGORY` %in% c("Kindergartens", "Kindergarten-cum-child Care Centres") ~ 1,
TRUE ~ 0 # In case there are any unexpected values
))
head(School_data)
## SCHOOL.NO. ENGLISH.CATEGORY
## 1 115509000123 Aided Primary Schools
## 2 536547000223 Aided Primary Schools
## 3 113000000123 Aided Primary Schools
## 4 113417000223 Aided Primary Schools
## 5 512176000223 Aided Primary Schools
## 6 512176000123 Aided Primary Schools
## ENGLISH.NAME
## 1 SHA TIN METHODIST PRIMARY SCHOOL
## 2 TUNG CHUNG CATHOLIC SCHOOL
## 3 TUNG KOON SCHOOL
## 4 THE CHURCH OF CHRIST IN CHINA CHUEN YUEN FIRST PRIMARY SCHOOL
## 5 HOP YAT CHURCH SCHOOL
## 6 HOP YAT CHURCH SCHOOL
## ENGLISH.ADDRESS
## 1 KWONG YUEN ESTATE, SHATIN, NEW TERRITORIES
## 2 8 YAT TUNG STREET YAT TUNG (2) ESTATE TUNG CHUNG LANTAU ISLAND NEW TERRITORIES
## 3 JOCKEY CLUB ROAD, SHEUNG SHUI, NEW TERRITORIES
## 4 36 WING SHUN STREET, TSUEN WAN, NEW TERRITORIES (EXCLUDING 2/F VISUAL ARTS ROOM)
## 5 6 PERTH STREET HOMANTIN KOWLOON
## 6 7 PRINCESS MARGARET ROAD HOMANTIN KOWLOON
## LONGITUDE LATITUDE EASTING NORTHING STUDENTS.GENDER SESSION
## 1 114.21463726 22.38018009 840157.0 826775.0 CO-ED WHOLE DAY
## 2 113.93545348 22.28267348 811388.0 815997.0 CO-ED WHOLE DAY
## 3 114.1310215 22.50282065 831550.0 840356.0 CO-ED WHOLE DAY
## 4 114.11502938 22.36323017 829898.139065 824898.86077027 CO-ED WHOLE DAY
## 5 114.17944646 22.32079952 836533.0 820199.0 CO-ED WHOLE DAY
## 6 114.1793591 22.32051055 836524.0 820167.0 CO-ED WHOLE DAY
## DISTRICT FINANCE.TYPE SCHOOL.LEVEL TELEPHONE FAX.NUMBER WEBSITE
## 1 SHA TIN AIDED PRIMARY 26366533 26494329 http://www.smps.edu.hk
## 2 ISLANDS AIDED PRIMARY 21094962 21210166 http://www.tccs.edu.hk
## 3 NORTH AIDED PRIMARY 26700334 26684311 http://www.tks.edu.hk
## 4 TSUEN WAN AIDED PRIMARY 24900336 24158627 N.A.
## 5 KOWLOON CITY AIDED PRIMARY 27111013 27142465 http://www.hycs.edu.hk
## 6 KOWLOON CITY AIDED PRIMARY 27111013 27142465 http://www.hycs.edu.hk
## RELIGION SESSION_RANK ENGLISH_CATEGORY_RANK
## 1 PROTESTANTISM / CHRISTIANITY 10 4
## 2 CATHOLICISM 10 4
## 3 NOT APPLICABLE 10 4
## 4 PROTESTANTISM / CHRISTIANITY 10 4
## 5 PROTESTANTISM / CHRISTIANITY 10 4
## 6 PROTESTANTISM / CHRISTIANITY 10 4
#ENGLISH_CATEGORY_RANK column
table(School_data$ENGLISH_CATEGORY_RANK)
##
## 0 1 3 4 5 6 7 8 9 10
## 259 2248 33 428 49 59 365 30 18 72
#Rank based on finance type
#Assign RANK Based on Finance Type ---
School_data <- School_data %>%
mutate(FINANCE_TYPE_RANK = case_when(
`FINANCE.TYPE` == "ENGLISH SCHOOLS FOUNDATION" ~ 10,
`FINANCE.TYPE` == "DIRECT SUBSIDY SCHEME" ~ 9,
`FINANCE.TYPE` == "AIDED" ~ 8,
`FINANCE.TYPE` == "GOVERNMENT" ~ 7,
`FINANCE.TYPE` == "PRIVATE" ~ 5,
`FINANCE.TYPE` == "PRIVATE INDEPENDENT SCH SCHEME" ~ 3,
`FINANCE.TYPE` == "CAPUT" ~ 1,
TRUE ~ 0
))
# FINANCE_TYPE_RANK column
table(School_data$FINANCE_TYPE_RANK)
##
## 1 3 5 7 8 9 10
## 2 15 2426 63 955 82 18
# Assigning RANK Based on School Level
School_data <- School_data %>%
mutate(SCHOOL_LEVEL_RANK = case_when(
`SCHOOL.LEVEL` == "SECONDARY" ~ 10,
`SCHOOL.LEVEL` == "PRIMARY" ~ 8,
`SCHOOL.LEVEL` == "KINDERGARTEN-CUM-CHILD CARE CENTRES" ~ 6,
`SCHOOL.LEVEL` == "KINDERGARTEN" ~ 5,
TRUE ~ 0
))
# SCHOOL_LEVEL_RANK column
table(School_data$SCHOOL_LEVEL_RANK)
##
## 5 6 8 10
## 1253 995 692 621
#districts for NTE and NTW
nte_ntw_districts <- c("SAI KUNG", "SHA TIN", "TAI PO", "NORTH", "YUEN LONG", "KWAI TSING", "TSUEN WAN", "TUEN MUN")
# Filtering School_data to keep only rows where DISTRICT is in NTE or NTW
School_data <- School_data %>%
filter(DISTRICT %in% nte_ntw_districts)
# Checking the filtered data
table(School_data$DISTRICT)
##
## KWAI TSING NORTH SAI KUNG SHA TIN TAI PO TSUEN WAN TUEN MUN YUEN LONG
## 222 168 234 293 145 116 241 287
#Column for region
nte_districts <- c("SAI KUNG", "SHA TIN", "TAI PO", "NORTH
")
ntw_districts <- c("YUEN LONG", "KWAI TSING", "TSUEN WAN", "TUEN MUN")
School_data <- School_data %>%
mutate(REGION = case_when(
DISTRICT %in% nte_districts ~ "NTE", # New Territories East
DISTRICT %in% ntw_districts ~ "NTW", # New Territories West
TRUE ~ "Other"
))
#School rank
# Combining all the individual ranks to generate a total score for each school
colnames(School_data)
## [1] "SCHOOL.NO." "ENGLISH.CATEGORY" "ENGLISH.NAME"
## [4] "ENGLISH.ADDRESS" "LONGITUDE" "LATITUDE"
## [7] "EASTING" "NORTHING" "STUDENTS.GENDER"
## [10] "SESSION" "DISTRICT" "FINANCE.TYPE"
## [13] "SCHOOL.LEVEL" "TELEPHONE" "FAX.NUMBER"
## [16] "WEBSITE" "RELIGION" "SESSION_RANK"
## [19] "ENGLISH_CATEGORY_RANK" "FINANCE_TYPE_RANK" "SCHOOL_LEVEL_RANK"
## [22] "REGION"
School_data <- School_data %>%
mutate(
TOTAL_RANK = SESSION_RANK + ENGLISH_CATEGORY_RANK + FINANCE_TYPE_RANK + SCHOOL_LEVEL_RANK
)
# Ranking the schools based on the total rank
School_data <- School_data %>%
arrange(desc(TOTAL_RANK)) %>%
mutate(RANK = row_number())
#top 10 schools based on the rank
top_schools <- School_data %>%
select(`ENGLISH.NAME`, DISTRICT, REGION, RANK) %>%
slice_head(n = 50)
# View the top 3 schools
head(top_schools, n = 50)
## ENGLISH.NAME
## 1 SHATIN COLLEGE
## 2 SHATIN COLLEGE
## 3 CLEARWATER BAY SCHOOL
## 4 SHATIN JUNIOR SCHOOL
## 5 CLEARWATER BAY SCHOOL
## 6 CARITAS FANLING CHAN CHUN HA SECONDARY SCHOOL
## 7 BAPTIST LUI MING CHOI SECONDARY SCHOOL
## 8 CHRISTIAN ALLIANCE S W CHAN MEMORIAL COLLEGE
## 9 TEXTILE INSTITUTE AMERICAN CHAMBER OF COMMERCE WOO HON FAI SECONDARY SCHOOL
## 10 BUDDHIST SUM HEUNG LAM MEMORIAL COLLEGE
## 11 PO LEUNG KUK C.W. CHU COLLEGE
## 12 CHRISTIAN & MISSIONARY ALLIANCE SUN KEI SECONDARY SCHOOL
## 13 FUNG KAI NO.1 SECONDARY SCHOOL
## 14 CARMEL BUNNAN TONG MEMORIAL SECONDARY SCHOOL
## 15 DAUGHTERS OF MARY HELP OF CHRISTIANS SIU MING CATHOLIC SECONDARY SCHOOL
## 16 LIONS CLUBS INTERNATIONAL TSEUNG CHUI KING COLLEGE
## 17 SHA TIN METHODIST COLLEGE
## 18 KWOK TAK SENG CATHOLIC SECONDARY SCHOOL
## 19 SHATIN PUI YING COLLEGE
## 20 CHENG CHEK CHEE SECONDARY SCHOOL OF SAI KUNG AND HANG HAU DISTRICT, N.T.
## 21 IMMACULATE HEART OF MARY COLLEGE
## 22 SHEK LEI CATHOLIC SECONDARY SCHOOL
## 23 LINGNAN DR. CHUNG WING KWONG MEMORIAL SECONDARY SCHOOL
## 24 TUNG WAH GROUP OF HOSPITALS LO KON TING MEMORIAL COLLEGE
## 25 SHENG KUNG HUI BISHOP BAKER SECONDARY SCHOOL
## 26 MA KAM MING CHARITABLE FOUNDATION MA CHAN DUEN HEY MEMORIAL COLLEGE
## 27 KWAI CHUNG METHODIST COLLEGE
## 28 YAN CHAI HOSPITAL TUNG CHI YING MEMORIAL SECONDARY SCHOOL
## 29 THE CHURCH OF CHRIST IN CHINA FONG YUN WAH SECONDARY SCHOOL
## 30 CARMEL DIVINE GRACE FOUNDATION SECONDARY SCHOOL
## 31 CATHOLIC MING YUEN SECONDARY SCHOOL
## 32 CUMBERLAND PRESBYTERIAN CHURCH YAO DAO SECONDARY SCHOOL
## 33 SHENG KUNG HUI TSANG SHIU TIM SECONDARY SCHOOL
## 34 YUEN LONG PUBLIC MIDDLE SCHOOL ALUMNI ASSOCIATION TANG SIU TONG SECONDARY SCHOOL
## 35 ASSEMBLY OF GOD HEBRON SECONDARY SCHOOL
## 36 LOCK TAO SECONDARY SCHOOL
## 37 BUDDHIST WONG WAN TIN COLLEGE
## 38 PUI SHING CATHOLIC SECONDARY SCHOOL
## 39 HONG KONG TAOIST ASSOCIATION THE YUEN YUEN INSTITUTE NO.2 SECONDARY SCHOOL
## 40 BETHEL HIGH SCHOOL
## 41 SHENG KUNG HUI LAM KAU MOW SECONDARY SCHOOL
## 42 VALTORTA COLLEGE
## 43 PO LEUNG KUK WU CHUNG COLLEGE
## 44 KAU YAN COLLEGE
## 45 POK OI HOSPITAL CHAN KAI MEMORIAL COLLEGE
## 46 THE CHURCH OF CHRIST IN CHINA KEI YUEN COLLEGE
## 47 HONG KONG TAOIST ASSOCIATION TANG HIN MEMORIAL SECONDARY SCHOOL
## 48 CHRISTIAN ALLIANCE CHENG WING GEE COLLEGE OF THE KOWLOON TONG CHURCH OF THE CHINESE CHRISTIAN AND MISSIONARY ALLIANCE, HONG KONG
## 49 TUNG WAH GROUP OF HOSPITALS YOW KAM YUEN COLLEGE
## 50 TOI SHAN ASSOCIATION COLLEGE
## DISTRICT REGION RANK
## 1 SHA TIN NTE 1
## 2 SHA TIN NTE 2
## 3 SAI KUNG NTE 3
## 4 SHA TIN NTE 4
## 5 SAI KUNG NTE 5
## 6 NORTH Other 6
## 7 SHA TIN NTE 7
## 8 NORTH Other 8
## 9 TSUEN WAN NTW 9
## 10 TUEN MUN NTW 10
## 11 SHA TIN NTE 11
## 12 SAI KUNG NTE 12
## 13 NORTH Other 13
## 14 TUEN MUN NTW 14
## 15 KWAI TSING NTW 15
## 16 KWAI TSING NTW 16
## 17 SHA TIN NTE 17
## 18 SHA TIN NTE 18
## 19 SHA TIN NTE 19
## 20 SAI KUNG NTE 20
## 21 SHA TIN NTE 21
## 22 KWAI TSING NTW 22
## 23 KWAI TSING NTW 23
## 24 YUEN LONG NTW 24
## 25 YUEN LONG NTW 25
## 26 SAI KUNG NTE 26
## 27 KWAI TSING NTW 27
## 28 SHA TIN NTE 28
## 29 YUEN LONG NTW 29
## 30 SAI KUNG NTE 30
## 31 SAI KUNG NTE 31
## 32 YUEN LONG NTW 32
## 33 SHA TIN NTE 33
## 34 YUEN LONG NTW 34
## 35 TAI PO NTE 35
## 36 SHA TIN NTE 36
## 37 SHA TIN NTE 37
## 38 YUEN LONG NTW 38
## 39 TAI PO NTE 39
## 40 YUEN LONG NTW 40
## 41 SHA TIN NTE 41
## 42 TAI PO NTE 42
## 43 SHA TIN NTE 43
## 44 TAI PO NTE 44
## 45 SHA TIN NTE 45
## 46 YUEN LONG NTW 46
## 47 NORTH Other 47
## 48 SHA TIN NTE 48
## 49 SHA TIN NTE 49
## 50 SHA TIN NTE 50
##Housing parking
colnames(Housing_parking)
## [1] "District"
## [2] "Location"
## [3] "Types.of.Operating.Hours"
## [4] "For.Vehicles.Other.Than.Medium.and.Heavy.Goods.Vehicles..Buses..Motor.Cycles.and.Pedal.Cycles."
## [5] "For.Goods.Vehicles"
## [6] "For.Coaches"
nte_districts <- c("SAI KUNG", "SHA TIN", "TAI PO", "NORTH")
ntw_districts <- c("YUEN LONG", "KWAI TSING", "TSUEN WAN", "TUEN MUN")
Housing_parking <- Housing_parking %>%
mutate(
Total_Parking = `For.Vehicles.Other.Than.Medium.and.Heavy.Goods.Vehicles..Buses..Motor.Cycles.and.Pedal.Cycles.` +
`For.Goods.Vehicles` +
`For.Coaches`, # Calculate total parking spaces
REGION = case_when(
District %in% nte_districts ~ "NTE",
District %in% ntw_districts ~ "NTW",
TRUE ~ "Other" # Assign "Other" for districts not in NTE or NTW
)
) %>%
group_by(District, REGION) %>% # Group by District and REGION
summarise(
Total_Parking = sum(Total_Parking, na.rm = TRUE), # Sum parking spaces by district and region
.groups = "drop" # Remove grouping after summarization
) %>%
arrange(REGION, desc(Total_Parking)) # Sort by REGION and then by total parking
# View the result
print(Housing_parking)
## # A tibble: 8 × 3
## District REGION Total_Parking
## <chr> <chr> <int>
## 1 SHA TIN NTE 1587
## 2 TAI PO NTE 1403
## 3 SAI KUNG NTE 1203
## 4 NORTH NTE 1097
## 5 TUEN MUN NTW 1171
## 6 YUEN LONG NTW 957
## 7 KWAI TSING NTW 509
## 8 TSUEN WAN NTW 506
####################Traffic data
colnames(Traffic_data)
## [1] "Date" "District" "Control.Point"
## [4] "Arrival./.Departure" "Hong.Kong.Residents" "Mainland.Visitors"
## [7] "Other.Visitors" "Total"
nte_districts <- c("SAI KUNG", "SHA TIN", "TAI PO", "NORTH")
ntw_districts <- c("YUEN LONG", "KWAI TSING", "TSUEN WAN", "TUEN MUN")
# Add REGION column based on districts
Traffic <- Traffic_data %>%
mutate(REGION = case_when(
District %in% nte_districts ~ "NTE",
District %in% ntw_districts ~ "NTW",
TRUE ~ "Other"
))
# Summarize total traffic (including all visitors) by District and Control Point
Traffic_summary <- Traffic %>%
group_by(District, "Control Point", REGION) %>%
summarise(
Total_HongKong_Residents = sum(`Hong.Kong.Residents`, na.rm = TRUE),
Total_Mainland_Visitors = sum(`Mainland.Visitors`, na.rm = TRUE),
Total_Other_Visitors = sum(`Other.Visitors`, na.rm = TRUE),
Total_Traffic = sum(Total, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(REGION, desc(Total_Traffic))
# Summarize total traffic (arrivals and departures) by District
Traffic_summary <- Traffic %>%
mutate(REGION = case_when(
District %in% nte_districts ~ "NTE",
District %in% ntw_districts ~ "NTW",
TRUE ~ "Other"
)) %>%
group_by(District, REGION) %>%
summarise(
Total_HongKong_Residents_Arrivals = sum(`Hong.Kong.Residents`[`Arrival./.Departure` == "Arrival"], na.rm = TRUE),
Total_HongKong_Residents_Departures = sum(`Hong.Kong.Residents`[`Arrival./.Departure` == "Departure"], na.rm = TRUE),
Total_Mainland_Visitors_Arrivals = sum(`Mainland.Visitors`[`Arrival./.Departure` == "Arrival"], na.rm = TRUE),
Total_Mainland_Visitors_Departures = sum(`Mainland.Visitors`[`Arrival./.Departure` == "Departure"], na.rm = TRUE),
Total_Other_Visitors_Arrivals = sum(`Other.Visitors`[`Arrival./.Departure` == "Arrival"], na.rm = TRUE),
Total_Other_Visitors_Departures = sum(`Other.Visitors`[`Arrival./.Departure` == "Departure"], na.rm = TRUE),
Total_Arrivals = sum(Total[`Arrival./.Departure` == "Arrival"], na.rm = TRUE),
Total_Departures = sum(Total[`Arrival./.Departure` == "Departure"], na.rm = TRUE),
Total_Traffic = sum(Total, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(REGION, desc(Total_Traffic)) # Sort by REGION and then by total traffic
# View the result
print(Traffic_summary)
## # A tibble: 2 × 11
## District REGION Total_HongKong_Residents…¹ Total_HongKong_Resid…² Total_Mainland_Visit…³
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 NORTH NTE 98817538 98766738 26783604
## 2 TUEN MUN NTW 0 0 0
## # ℹ abbreviated names: ¹Total_HongKong_Residents_Arrivals,
## # ²Total_HongKong_Residents_Departures, ³Total_Mainland_Visitors_Arrivals
## # ℹ 6 more variables: Total_Mainland_Visitors_Departures <dbl>,
## # Total_Other_Visitors_Arrivals <dbl>, Total_Other_Visitors_Departures <dbl>,
## # Total_Arrivals <dbl>, Total_Departures <dbl>, Total_Traffic <dbl>
###############
###############Housing prices
# Reshape the dataset from wide to long format
Housing_long <- Housing_prices %>%
pivot_longer(cols = `KWAI.TSING`:`YUEN.LONG`,
names_to = "District",
values_to = "Housing_Price") %>%
arrange(Year, District)
# View the reshaped dataset
print(Housing_long)
## # A tibble: 200 × 3
## Year District Housing_Price
## <int> <chr> <int>
## 1 1999 KWAI.TSING 41861
## 2 1999 NORTH 35735
## 3 1999 SAI.KUNG 49287
## 4 1999 SHA.TIN 35471
## 5 1999 TAI.PO 37095
## 6 1999 TSUEN.WAN 35042
## 7 1999 TUEN.MUN 60751
## 8 1999 YUEN.LONG 43952
## 9 2000 KWAI.TSING 35975
## 10 2000 NORTH 31444
## # ℹ 190 more rows
# Define NTE and NTW districts
colnames(Housing_long)
## [1] "Year" "District" "Housing_Price"
nte_districts <- c("SAI KUNG", "SHA TIN", "TAI PO", "NORTH")
ntw_districts <- c("YUEN LONG", "KWAI TSING", "TSUEN WAN", "TUEN MUN")
# Add a column for REGION
Housing_long <- Housing_long %>%
mutate(REGION = case_when(
District %in% nte_districts ~ "NTE",
District %in% ntw_districts ~ "NTW",
TRUE ~ "Other"
))
ggplot(Housing_long, aes(x = Year, y = Housing_Price, color = REGION, group = District)) +
geom_line(size = 1.2) + # Bold lines
geom_text(data = Housing_long %>%
group_by(District) %>%
filter(Year == max(Year)), # Filter to get last year for each district
aes(label = District),
vjust = -0.5, size = 3, check_overlap = TRUE) +
labs(title = "Housing Price Trends by District (1999-2023)",
x = "Year",
y = "Housing Price (HKD)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
# Define the districts for NTE and NTW
nte_districts <- c("SAI KUNG", "SHA TIN", "TAI PO", "NORTH")
ntw_districts <- c("YUEN LONG", "KWAI TSING", "TSUEN WAN", "TUEN MUN")
# average housing price for each district, region, and rank by price
Housing_summary <- Housing_prices %>%
pivot_longer(cols = starts_with("KWAI.TSING"):starts_with("YUEN.LONG"),
names_to = "District",
values_to = "Housing_Price") %>%
group_by(District) %>%
summarise(Average_Housing_Price = mean(Housing_Price, na.rm = TRUE)) %>%
mutate(REGION = case_when(
District %in% nte_districts ~ "NTE",
District %in% ntw_districts ~ "NTW",
TRUE ~ "Other"
)) %>%
arrange(Average_Housing_Price) # Sorting by Average_Housing_Price (ascending)
# View the result
print(Housing_summary)
## # A tibble: 8 × 3
## District Average_Housing_Price REGION
## <chr> <dbl> <chr>
## 1 TSUEN.WAN 69170. Other
## 2 NORTH 75031. NTE
## 3 SHA.TIN 81318 Other
## 4 TAI.PO 86091. Other
## 5 KWAI.TSING 99885. Other
## 6 SAI.KUNG 104439. Other
## 7 YUEN.LONG 110206. Other
## 8 TUEN.MUN 127430. Other
##################Census and social information
# Define the districts for NTE and NTW in capital letters
nte_districts <- c("SAI KUNG", "SHA TIN", "TAI PO", "NORTH")
ntw_districts <- c("YUEN LONG", "KWAI TSING", "TSUEN WAN", "TUEN MUN")
colnames(Census_data)
## [1] "DISTRICT" "Total.population"
## [3] "Male" "Female"
## [5] "X..15" "X15...24"
## [7] "X25...44" "X45...64"
## [9] "X65." "Divorces"
## [11] "English.speaking" "No.schooling..Pre.primary"
## [13] "Post.secondary..Degree.course" "Study.in.the.same.district.3."
## [15] "Working.population" "Employers"
# Add the REGION column to Census_data, considering the capitalized 'DISTRICT' column name
Census_data <- Census_data %>%
mutate(REGION = case_when(
`DISTRICT` %in% nte_districts ~ "NTE",
`DISTRICT` %in% ntw_districts ~ "NTW",
TRUE ~ "Other"
))
#Friendly neighbourhood is the one with many children
# age distribution visualization
# age group data to compare across districts
Census_data %>%
gather(key = "Age_Group", value = "Population", "X..15", "X15...24", "X25...44", "X45...64", "X65.") %>%
ggplot(aes(x = DISTRICT, y = Population, fill = Age_Group)) +
geom_bar(stat = "identity", position = "stack") +
labs(title = "Age Group Distribution by District", x = "District", y = "Population")
#Education
# percentage of population with post-secondary education
Census_data %>%
mutate(Post_Secondary_Percentage = (`Post.secondary..Degree.course` / `Total.population`) * 100) %>%
ggplot(aes(x = DISTRICT, y = Post_Secondary_Percentage, fill = REGION)) +
geom_bar(stat = "identity") +
labs(title = "Post-Secondary Education Percentage by District", x = "District", y = "Percentage (%)")
#working population and employers
# Plot of working population and employers by district
ggplot(Census_data, aes(x = DISTRICT, y = `Working.population`, fill = REGION)) +
geom_bar(stat = "identity") +
labs(title = "Working Population by District", x = "District", y = "Working Population")
# social dynamics
# Plot divorce rates by district
ggplot(Census_data, aes(x = DISTRICT, y = Divorces, fill = REGION)) +
geom_bar(stat = "identity") +
labs(title = "Divorce Rates by District", x = "District", y = "Divorces")
Census_data <- Census_data %>%
mutate(
Post_Secondary_Percentage = (`Post.secondary..Degree.course` / `Total.population`) * 100,
Rank_Score = (`Working.population` * 0.6) + (Post_Secondary_Percentage * 0.4)
) %>%
arrange(desc(Rank_Score))
# Compare the number of working population in NTE and NTW
ggplot(Census_data, aes(x = REGION, y = `Working.population`, fill = REGION)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Working Population by Region (NTE vs NTW)", x = "Region", y = "Working Population")
####Cluster analysis
# Select relevant columns for clustering
clustering_data <- Census_data %>%
select(`Total.population`, `Post.secondary..Degree.course`, `Working.population`, Divorces)
# Normalize data for clustering
clustering_data_scaled <- scale(clustering_data)
# Perform k-means clustering (you can experiment with k)
set.seed(123)
kmeans_result <- kmeans(clustering_data_scaled, centers = 3)
# Add cluster to the original data
Census_data$Cluster <- kmeans_result$cluster
# Visualize clustering
ggplot(Census_data, aes(x = `Working.population`, y = `Post.secondary..Degree.course`, color = as.factor(Cluster))) +
geom_point() +
labs(title = "Cluster Analysis of Districts", x = "Working Population", y = "Post-Secondary Education")
#MERGE DATASET
# Standardizing column names for consistency
colnames(Census_data)[colnames(Census_data) == "DISTRICT"] <- "District"
colnames(Housing_parking)[colnames(Housing_parking) == "District"] <- "District"
colnames(Housing_long)[colnames(Housing_long) == "District"] <- "District"
colnames(Traffic)[colnames(Traffic) == "District"] <- "District"
colnames(School_data)[colnames(School_data) == "DISTRICT"] <- "District"
colnames(Census_data)[colnames(Census_data) == "District"] <- "District"
# Function to calculate mode
get_mode <- function(x) {
ux <- unique(na.omit(x))
ux[which.max(tabulate(match(x, ux)))]
}
# Aggregating School_data by District
School_data_agg <- School_data %>%
select(-c(ENGLISH.NAME, ENGLISH.ADDRESS, LONGITUDE, LATITUDE, EASTING, NORTHING, TELEPHONE, FAX.NUMBER, WEBSITE, RELIGION)) %>%
group_by(District) %>%
summarize(
Avg_SCHOOL_NO = mean(as.numeric(SCHOOL.NO.), na.rm = TRUE),
Most_Common_ENGLISH_CATEGORY = get_mode(ENGLISH.CATEGORY),
Avg_SESSION_RANK = mean(as.numeric(SESSION_RANK), na.rm = TRUE),
Avg_ENGLISH_CATEGORY_RANK = mean(as.numeric(ENGLISH_CATEGORY_RANK), na.rm = TRUE),
Avg_FINANCE_TYPE_RANK = mean(as.numeric(FINANCE_TYPE_RANK), na.rm = TRUE),
Avg_SCHOOL_LEVEL_RANK = mean(as.numeric(SCHOOL_LEVEL_RANK), na.rm = TRUE),
Avg_TOTAL_RANK = mean(as.numeric(TOTAL_RANK), na.rm = TRUE),
Avg_RANK = mean(as.numeric(RANK), na.rm = TRUE),
Most_Common_STUDENTS_GENDER = get_mode(STUDENTS.GENDER),
Most_Common_SESSION = get_mode(SESSION),
Most_Common_FINANCE_TYPE = get_mode(FINANCE.TYPE),
Most_Common_SCHOOL_LEVEL = get_mode(SCHOOL.LEVEL)
)
# View the first few rows of the aggregated School_data
head(School_data_agg)
## # A tibble: 6 × 13
## District Avg_SCHOOL_NO Most_Common_ENGLISH_C…¹ Avg_SESSION_RANK Avg_ENGLISH_CATEGORY…²
## <chr> <dbl> <chr> <dbl> <dbl>
## 1 KWAI TSING 344182346973. Kindergartens 8.15 2.18
## 2 NORTH 357500863242. Kindergartens 8.11 2.31
## 3 SAI KUNG 464819423238. Kindergarten-cum-child… 7.79 2.18
## 4 SHA TIN 350165730508. Kindergartens 8.08 2.47
## 5 TAI PO 334602400139. Kindergartens 8.32 2.66
## 6 TSUEN WAN 371665396702. Kindergarten-cum-child… 7.85 2.40
## # ℹ abbreviated names: ¹Most_Common_ENGLISH_CATEGORY, ²Avg_ENGLISH_CATEGORY_RANK
## # ℹ 8 more variables: Avg_FINANCE_TYPE_RANK <dbl>, Avg_SCHOOL_LEVEL_RANK <dbl>,
## # Avg_TOTAL_RANK <dbl>, Avg_RANK <dbl>, Most_Common_STUDENTS_GENDER <chr>,
## # Most_Common_SESSION <chr>, Most_Common_FINANCE_TYPE <chr>,
## # Most_Common_SCHOOL_LEVEL <chr>
# Merge step-by-step
merged_data <- Census_data %>%
left_join(Housing_parking, by = "District")
gc() # Clear memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 3889291 207.8 6434728 343.7 NA 6434728 343.7
## Vcells 6944985 53.0 14786712 112.9 16384 10146326 77.5
merged_data <- merged_data %>%
left_join(Housing_long, by = "District")
gc() # Clear memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 3889307 207.8 6434728 343.7 NA 6434728 343.7
## Vcells 6945409 53.0 14786712 112.9 16384 10146326 77.5
merged_data <- merged_data %>%
left_join(Traffic, by = "District")
## Warning in left_join(., Traffic, by = "District"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 4 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to
## silence this warning.
gc() # Clear memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 3892156 207.9 6434728 343.7 NA 6434728 343.7
## Vcells 15418564 117.7 24276887 185.3 16384 15798978 120.6
merged_data <- merged_data %>%
left_join(School_data_agg, by = "District")
gc() # Clear memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 3892186 207.9 6434728 343.7 NA 6434728 343.7
## Vcells 19742371 150.7 31621539 241.3 16384 28590010 218.2
tail(merged_data)
## District Total.population Male Female X..15 X15...24 X25...44 X45...64 X65.
## 360307 NORTH 309631 145343 164288 33499 27654 88314 104607 55557
## 360308 NORTH 309631 145343 164288 33499 27654 88314 104607 55557
## 360309 NORTH 309631 145343 164288 33499 27654 88314 104607 55557
## 360310 NORTH 309631 145343 164288 33499 27654 88314 104607 55557
## 360311 NORTH 309631 145343 164288 33499 27654 88314 104607 55557
## 360312 NORTH 309631 145343 164288 33499 27654 88314 104607 55557
## Divorces English.speaking No.schooling..Pre.primary Post.secondary..Degree.course
## 360307 17410 4330 11971 48482
## 360308 17410 4330 11971 48482
## 360309 17410 4330 11971 48482
## 360310 17410 4330 11971 48482
## 360311 17410 4330 11971 48482
## 360312 17410 4330 11971 48482
## Study.in.the.same.district.3. Working.population Employers REGION.x
## 360307 23134 146416 2622 NTE
## 360308 23134 146416 2622 NTE
## 360309 23134 146416 2622 NTE
## 360310 23134 146416 2622 NTE
## 360311 23134 146416 2622 NTE
## 360312 23134 146416 2622 NTE
## Post_Secondary_Percentage Rank_Score Cluster REGION.y Total_Parking Year
## 360307 15.65799 87855.86 2 NTE 1097 2023
## 360308 15.65799 87855.86 2 NTE 1097 2023
## 360309 15.65799 87855.86 2 NTE 1097 2023
## 360310 15.65799 87855.86 2 NTE 1097 2023
## 360311 15.65799 87855.86 2 NTE 1097 2023
## 360312 15.65799 87855.86 2 NTE 1097 2023
## Housing_Price REGION.x.x Date Control.Point Arrival./.Departure
## 360307 130015 NTE 45516 Heung Yuen Wai Arrival
## 360308 130015 NTE 45516 Heung Yuen Wai Departure
## 360309 130015 NTE 45516 Lok Ma Chau Arrival
## 360310 130015 NTE 45516 Lok Ma Chau Departure
## 360311 130015 NTE 45516 Sha Tau Kok Arrival
## 360312 130015 NTE 45516 Sha Tau Kok Departure
## Hong.Kong.Residents Mainland.Visitors Other.Visitors Total REGION.y.y
## 360307 43866 13650 227 57743 NTE
## 360308 33769 7563 232 41564 NTE
## 360309 27305 3390 386 31081 NTE
## 360310 13307 6242 608 20157 NTE
## 360311 0 0 0 0 NTE
## 360312 0 0 0 0 NTE
## Avg_SCHOOL_NO Most_Common_ENGLISH_CATEGORY Avg_SESSION_RANK
## 360307 357500863242 Kindergartens 8.113095
## 360308 357500863242 Kindergartens 8.113095
## 360309 357500863242 Kindergartens 8.113095
## 360310 357500863242 Kindergartens 8.113095
## 360311 357500863242 Kindergartens 8.113095
## 360312 357500863242 Kindergartens 8.113095
## Avg_ENGLISH_CATEGORY_RANK Avg_FINANCE_TYPE_RANK Avg_SCHOOL_LEVEL_RANK
## 360307 2.309524 6.005952 6.571429
## 360308 2.309524 6.005952 6.571429
## 360309 2.309524 6.005952 6.571429
## 360310 2.309524 6.005952 6.571429
## 360311 2.309524 6.005952 6.571429
## 360312 2.309524 6.005952 6.571429
## Avg_TOTAL_RANK Avg_RANK Most_Common_STUDENTS_GENDER Most_Common_SESSION
## 360307 23 866.375 CO-ED WHOLE DAY
## 360308 23 866.375 CO-ED WHOLE DAY
## 360309 23 866.375 CO-ED WHOLE DAY
## 360310 23 866.375 CO-ED WHOLE DAY
## 360311 23 866.375 CO-ED WHOLE DAY
## 360312 23 866.375 CO-ED WHOLE DAY
## Most_Common_FINANCE_TYPE Most_Common_SCHOOL_LEVEL
## 360307 PRIVATE KINDERGARTEN
## 360308 PRIVATE KINDERGARTEN
## 360309 PRIVATE KINDERGARTEN
## 360310 PRIVATE KINDERGARTEN
## 360311 PRIVATE KINDERGARTEN
## 360312 PRIVATE KINDERGARTEN
# View duplicated rows
duplicated_rows <- merged_data[duplicated(merged_data$District), ]
View(duplicated_rows)
# remove duplicates
#merged_data <- merged_data %>% distinct()
# Aggregate School Data
#merged_data <- merged_data %>%
#group_by(District) %>%
#summarize(across(everything(), ~ if(is.numeric(.)) mean(., na.rm = TRUE) else first(.)))
# Check for missing values
colSums(is.na(merged_data))
## District Total.population Male
## 0 0 0
## Female X..15 X15...24
## 0 0 0
## X25...44 X45...64 X65.
## 0 0 0
## Divorces English.speaking No.schooling..Pre.primary
## 0 0 0
## Post.secondary..Degree.course Study.in.the.same.district.3. Working.population
## 0 0 0
## Employers REGION.x Post_Secondary_Percentage
## 0 0 0
## Rank_Score Cluster REGION.y
## 0 0 0
## Total_Parking Year Housing_Price
## 0 812 812
## REGION.x.x Date Control.Point
## 812 6 6
## Arrival./.Departure Hong.Kong.Residents Mainland.Visitors
## 6 6 6
## Other.Visitors Total REGION.y.y
## 6 6 6
## Avg_SCHOOL_NO Most_Common_ENGLISH_CATEGORY Avg_SESSION_RANK
## 0 0 0
## Avg_ENGLISH_CATEGORY_RANK Avg_FINANCE_TYPE_RANK Avg_SCHOOL_LEVEL_RANK
## 0 0 0
## Avg_TOTAL_RANK Avg_RANK Most_Common_STUDENTS_GENDER
## 0 0 0
## Most_Common_SESSION Most_Common_FINANCE_TYPE Most_Common_SCHOOL_LEVEL
## 0 0 0
# Visualize missing data
library(naniar)
#vis_miss(merged_data)
# Impute missing values with the mean
# Impute missing values for numerical columns with the mean
merged_data <- merged_data %>%
mutate(
Total_Parking = ifelse(is.na(Total_Parking), mean(Total_Parking, na.rm = TRUE), Total_Parking),
Housing_Price = ifelse(is.na(Housing_Price), mean(Housing_Price, na.rm = TRUE), Housing_Price),
Avg_SCHOOL_NO = ifelse(is.na(Avg_SCHOOL_NO), mean(Avg_SCHOOL_NO, na.rm = TRUE), Avg_SCHOOL_NO),
Avg_SESSION_RANK = ifelse(is.na(Avg_SESSION_RANK), mean(Avg_SESSION_RANK, na.rm = TRUE), Avg_SESSION_RANK),
Avg_ENGLISH_CATEGORY_RANK = ifelse(is.na(Avg_ENGLISH_CATEGORY_RANK), mean(Avg_ENGLISH_CATEGORY_RANK, na.rm = TRUE), Avg_ENGLISH_CATEGORY_RANK),
Avg_FINANCE_TYPE_RANK = ifelse(is.na(Avg_FINANCE_TYPE_RANK), mean(Avg_FINANCE_TYPE_RANK, na.rm = TRUE), Avg_FINANCE_TYPE_RANK),
Avg_SCHOOL_LEVEL_RANK = ifelse(is.na(Avg_SCHOOL_LEVEL_RANK), mean(Avg_SCHOOL_LEVEL_RANK, na.rm = TRUE), Avg_SCHOOL_LEVEL_RANK),
Avg_TOTAL_RANK = ifelse(is.na(Avg_TOTAL_RANK), mean(Avg_TOTAL_RANK, na.rm = TRUE), Avg_TOTAL_RANK)
)
# Function to get the mode
get_mode <- function(x) {
uniq_x <- unique(x)
uniq_x[which.max(tabulate(match(x, uniq_x)))]
}
# Impute missing values for categorical columns with the mode
merged_data <- merged_data %>%
mutate(
Most_Common_ENGLISH_CATEGORY = ifelse(is.na(Most_Common_ENGLISH_CATEGORY), get_mode(Most_Common_ENGLISH_CATEGORY), Most_Common_ENGLISH_CATEGORY),
Most_Common_STUDENTS_GENDER = ifelse(is.na(Most_Common_STUDENTS_GENDER), get_mode(Most_Common_STUDENTS_GENDER), Most_Common_STUDENTS_GENDER),
Most_Common_SESSION = ifelse(is.na(Most_Common_SESSION), get_mode(Most_Common_SESSION), Most_Common_SESSION),
Most_Common_FINANCE_TYPE = ifelse(is.na(Most_Common_FINANCE_TYPE), get_mode(Most_Common_FINANCE_TYPE), Most_Common_FINANCE_TYPE),
Most_Common_SCHOOL_LEVEL = ifelse(is.na(Most_Common_SCHOOL_LEVEL), get_mode(Most_Common_SCHOOL_LEVEL), Most_Common_SCHOOL_LEVEL)
)
# Impute missing values for numerical columns with the mean
merged_data <- merged_data %>%
mutate(
Hong.Kong.Residents = ifelse(is.na(Hong.Kong.Residents), mean(Hong.Kong.Residents, na.rm = TRUE), Hong.Kong.Residents),
Mainland.Visitors = ifelse(is.na(Mainland.Visitors), mean(Mainland.Visitors, na.rm = TRUE), Mainland.Visitors),
Other.Visitors = ifelse(is.na(Other.Visitors), mean(Other.Visitors, na.rm = TRUE), Other.Visitors),
Total = ifelse(is.na(Total), mean(Total, na.rm = TRUE), Total),
Avg_SCHOOL_NO = ifelse(is.na(Avg_SCHOOL_NO), mean(Avg_SCHOOL_NO, na.rm = TRUE), Avg_SCHOOL_NO),
Avg_SESSION_RANK = ifelse(is.na(Avg_SESSION_RANK), mean(Avg_SESSION_RANK, na.rm = TRUE), Avg_SESSION_RANK),
Date = ifelse(is.na(Date), mean(Date, na.rm = TRUE), Date) # If Date is numeric, else use mode for categorical
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Date = ifelse(is.na(Date), mean(Date, na.rm = TRUE), Date)`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA
# Function to get the mode (for categorical data)
get_mode <- function(x) {
uniq_x <- unique(x)
uniq_x[which.max(tabulate(match(x, uniq_x)))]
}
# Impute missing values for categorical columns with the mode
merged_data <- merged_data %>%
mutate(
REGION.x.x = ifelse(is.na(REGION.x.x), get_mode(REGION.x.x), REGION.x.x),
REGION.y.y = ifelse(is.na(REGION.y.y), get_mode(REGION.y.y), REGION.y.y),
Control.Point = ifelse(is.na(Control.Point), get_mode(Control.Point), Control.Point),
Most_Common_ENGLISH_CATEGORY = ifelse(is.na(Most_Common_ENGLISH_CATEGORY), get_mode(Most_Common_ENGLISH_CATEGORY), Most_Common_ENGLISH_CATEGORY)
)
colnames(merged_data) <- make.names(colnames(merged_data)) # Replaces spaces and symbols with dots
# Check data types
str(merged_data)
## 'data.frame': 360312 obs. of 45 variables:
## $ District : chr "SHA TIN" "YUEN LONG" "SAI KUNG" "TUEN MUN" ...
## $ Total.population : int 692806 668080 489037 506879 506879 506879 506879 506879 506879 506879 ...
## $ Male : int 317927 314056 223094 240321 240321 240321 240321 240321 240321 240321 ...
## $ Female : int 374879 354024 265943 266558 266558 266558 266558 266558 266558 266558 ...
## $ X..15 : int 78886 77199 56329 54136 54136 54136 54136 54136 54136 54136 ...
## $ X15...24 : int 53696 58988 39744 38535 38535 38535 38535 38535 38535 38535 ...
## $ X25...44 : int 201135 211376 150700 152609 152609 152609 152609 152609 152609 152609 ...
## $ X45...64 : int 220604 220151 164991 163644 163644 163644 163644 163644 163644 163644 ...
## $ X65. : int 138485 100366 77273 97955 97955 97955 97955 97955 97955 97955 ...
## $ Divorces : int 33651 36117 19976 29619 29619 29619 29619 29619 29619 29619 ...
## $ English.speaking : int 21042 16754 28502 9038 9038 9038 9038 9038 9038 9038 ...
## $ No.schooling..Pre.primary : int 23269 24183 14160 20637 20637 20637 20637 20637 20637 20637 ...
## $ Post.secondary..Degree.course: int 154778 116916 128362 79104 79104 79104 79104 79104 79104 79104 ...
## $ Study.in.the.same.district.3.: int 56374 36841 33449 49082 49082 49082 49082 49082 49082 49082 ...
## $ Working.population : int 340605 332587 262126 247419 247419 247419 247419 247419 247419 247419 ...
## $ Employers : int 9333 8749 8206 5248 5248 5248 5248 5248 5248 5248 ...
## $ REGION.x : chr "NTE" "NTW" "NTE" "NTW" ...
## $ Post_Secondary_Percentage : num 22.3 17.5 26.2 15.6 15.6 ...
## $ Rank_Score : num 204372 199559 157286 148458 148458 ...
## $ Cluster : int 3 3 3 3 3 3 3 3 3 3 ...
## $ REGION.y : chr "NTE" "NTW" "NTE" "NTW" ...
## $ Total_Parking : int 1587 957 1203 1171 1171 1171 1171 1171 1171 1171 ...
## $ Year : int NA NA NA NA NA NA NA NA NA NA ...
## $ Housing_Price : num 75031 75031 75031 75031 75031 ...
## $ REGION.x.x : chr "NTE" "NTE" "NTE" "NTE" ...
## $ Date : chr NA NA NA "44197" ...
## $ Control.Point : chr "Lo Wu" "Lo Wu" "Lo Wu" "Tuen Mun Ferry Terminal" ...
## $ Arrival...Departure : chr NA NA NA "Arrival" ...
## $ Hong.Kong.Residents : num 13709 13709 13709 0 0 ...
## $ Mainland.Visitors : num 3690 3690 3690 0 0 ...
## $ Other.Visitors : num 138 138 138 0 0 ...
## $ Total : num 17537 17537 17537 0 0 ...
## $ REGION.y.y : chr "NTE" "NTE" "NTE" "NTW" ...
## $ Avg_SCHOOL_NO : num 3.50e+11 4.14e+11 4.65e+11 3.31e+11 3.31e+11 ...
## $ Most_Common_ENGLISH_CATEGORY : chr "Kindergartens" "Kindergartens" "Kindergarten-cum-child Care Centres" "Kindergartens" ...
## $ Avg_SESSION_RANK : num 8.08 8 7.79 8.27 8.27 ...
## $ Avg_ENGLISH_CATEGORY_RANK : num 2.47 2.28 2.18 2.35 2.35 ...
## $ Avg_FINANCE_TYPE_RANK : num 6.1 6 5.85 6.09 6.09 ...
## $ Avg_SCHOOL_LEVEL_RANK : num 6.88 6.66 6.62 6.78 6.78 ...
## $ Avg_TOTAL_RANK : num 23.5 22.9 22.4 23.5 23.5 ...
## $ Avg_RANK : num 822 873 896 828 828 ...
## $ Most_Common_STUDENTS_GENDER : chr "CO-ED" "CO-ED" "CO-ED" "CO-ED" ...
## $ Most_Common_SESSION : chr "WHOLE DAY" "WHOLE DAY" "WHOLE DAY" "WHOLE DAY" ...
## $ Most_Common_FINANCE_TYPE : chr "PRIVATE" "PRIVATE" "PRIVATE" "PRIVATE" ...
## $ Most_Common_SCHOOL_LEVEL : chr "KINDERGARTEN" "KINDERGARTEN" "KINDERGARTEN-CUM-CHILD CARE CENTRES" "KINDERGARTEN" ...
# Convert columns
merged_data <- merged_data %>%
mutate(
Year = as.integer(Year),
District = as.factor(District),
Housing_Price = as.numeric(Housing_Price),
Total_Parking = as.numeric(Total_Parking)
)
print(head(merged_data))
## District Total.population Male Female X..15 X15...24 X25...44 X45...64 X65.
## 1 SHA TIN 692806 317927 374879 78886 53696 201135 220604 138485
## 2 YUEN LONG 668080 314056 354024 77199 58988 211376 220151 100366
## 3 SAI KUNG 489037 223094 265943 56329 39744 150700 164991 77273
## 4 TUEN MUN 506879 240321 266558 54136 38535 152609 163644 97955
## 5 TUEN MUN 506879 240321 266558 54136 38535 152609 163644 97955
## 6 TUEN MUN 506879 240321 266558 54136 38535 152609 163644 97955
## Divorces English.speaking No.schooling..Pre.primary Post.secondary..Degree.course
## 1 33651 21042 23269 154778
## 2 36117 16754 24183 116916
## 3 19976 28502 14160 128362
## 4 29619 9038 20637 79104
## 5 29619 9038 20637 79104
## 6 29619 9038 20637 79104
## Study.in.the.same.district.3. Working.population Employers REGION.x
## 1 56374 340605 9333 NTE
## 2 36841 332587 8749 NTW
## 3 33449 262126 8206 NTE
## 4 49082 247419 5248 NTW
## 5 49082 247419 5248 NTW
## 6 49082 247419 5248 NTW
## Post_Secondary_Percentage Rank_Score Cluster REGION.y Total_Parking Year Housing_Price
## 1 22.34074 204371.9 3 NTE 1587 NA 75030.6
## 2 17.50030 199559.2 3 NTW 957 NA 75030.6
## 3 26.24791 157286.1 3 NTE 1203 NA 75030.6
## 4 15.60609 148457.6 3 NTW 1171 NA 75030.6
## 5 15.60609 148457.6 3 NTW 1171 NA 75030.6
## 6 15.60609 148457.6 3 NTW 1171 NA 75030.6
## REGION.x.x Date Control.Point Arrival...Departure Hong.Kong.Residents
## 1 NTE <NA> Lo Wu <NA> 13709.48
## 2 NTE <NA> Lo Wu <NA> 13709.48
## 3 NTE <NA> Lo Wu <NA> 13709.48
## 4 NTE 44197 Tuen Mun Ferry Terminal Arrival 0.00
## 5 NTE 44197 Tuen Mun Ferry Terminal Departure 0.00
## 6 NTE 44228 Tuen Mun Ferry Terminal Arrival 0.00
## Mainland.Visitors Other.Visitors Total REGION.y.y Avg_SCHOOL_NO
## 1 3689.72 137.6715 17536.87 NTE 350165730508
## 2 3689.72 137.6715 17536.87 NTE 414190289336
## 3 3689.72 137.6715 17536.87 NTE 464819423238
## 4 0.00 0.0000 0.00 NTW 330503680643
## 5 0.00 0.0000 0.00 NTW 330503680643
## 6 0.00 0.0000 0.00 NTW 330503680643
## Most_Common_ENGLISH_CATEGORY Avg_SESSION_RANK Avg_ENGLISH_CATEGORY_RANK
## 1 Kindergartens 8.075085 2.467577
## 2 Kindergartens 8.000000 2.282230
## 3 Kindergarten-cum-child Care Centres 7.790598 2.179487
## 4 Kindergartens 8.269710 2.348548
## 5 Kindergartens 8.269710 2.348548
## 6 Kindergartens 8.269710 2.348548
## Avg_FINANCE_TYPE_RANK Avg_SCHOOL_LEVEL_RANK Avg_TOTAL_RANK Avg_RANK
## 1 6.095563 6.877133 23.51536 821.7440
## 2 6.003484 6.655052 22.94077 873.0767
## 3 5.850427 6.615385 22.43590 896.3590
## 4 6.087137 6.784232 23.48963 828.1079
## 5 6.087137 6.784232 23.48963 828.1079
## 6 6.087137 6.784232 23.48963 828.1079
## Most_Common_STUDENTS_GENDER Most_Common_SESSION Most_Common_FINANCE_TYPE
## 1 CO-ED WHOLE DAY PRIVATE
## 2 CO-ED WHOLE DAY PRIVATE
## 3 CO-ED WHOLE DAY PRIVATE
## 4 CO-ED WHOLE DAY PRIVATE
## 5 CO-ED WHOLE DAY PRIVATE
## 6 CO-ED WHOLE DAY PRIVATE
## Most_Common_SCHOOL_LEVEL
## 1 KINDERGARTEN
## 2 KINDERGARTEN
## 3 KINDERGARTEN-CUM-CHILD CARE CENTRES
## 4 KINDERGARTEN
## 5 KINDERGARTEN
## 6 KINDERGARTEN
# Check missing values again
colSums(is.na(merged_data))
## District Total.population Male
## 0 0 0
## Female X..15 X15...24
## 0 0 0
## X25...44 X45...64 X65.
## 0 0 0
## Divorces English.speaking No.schooling..Pre.primary
## 0 0 0
## Post.secondary..Degree.course Study.in.the.same.district.3. Working.population
## 0 0 0
## Employers REGION.x Post_Secondary_Percentage
## 0 0 0
## Rank_Score Cluster REGION.y
## 0 0 0
## Total_Parking Year Housing_Price
## 0 812 0
## REGION.x.x Date Control.Point
## 0 6 0
## Arrival...Departure Hong.Kong.Residents Mainland.Visitors
## 6 0 0
## Other.Visitors Total REGION.y.y
## 0 0 0
## Avg_SCHOOL_NO Most_Common_ENGLISH_CATEGORY Avg_SESSION_RANK
## 0 0 0
## Avg_ENGLISH_CATEGORY_RANK Avg_FINANCE_TYPE_RANK Avg_SCHOOL_LEVEL_RANK
## 0 0 0
## Avg_TOTAL_RANK Avg_RANK Most_Common_STUDENTS_GENDER
## 0 0 0
## Most_Common_SESSION Most_Common_FINANCE_TYPE Most_Common_SCHOOL_LEVEL
## 0 0 0
#############EXPLORATORY DATA ANALYSIS
#Data Overview
str(merged_data)
## 'data.frame': 360312 obs. of 45 variables:
## $ District : Factor w/ 8 levels "KWAI TSING","NORTH",..: 4 8 3 7 7 7 7 7 7 7 ...
## $ Total.population : int 692806 668080 489037 506879 506879 506879 506879 506879 506879 506879 ...
## $ Male : int 317927 314056 223094 240321 240321 240321 240321 240321 240321 240321 ...
## $ Female : int 374879 354024 265943 266558 266558 266558 266558 266558 266558 266558 ...
## $ X..15 : int 78886 77199 56329 54136 54136 54136 54136 54136 54136 54136 ...
## $ X15...24 : int 53696 58988 39744 38535 38535 38535 38535 38535 38535 38535 ...
## $ X25...44 : int 201135 211376 150700 152609 152609 152609 152609 152609 152609 152609 ...
## $ X45...64 : int 220604 220151 164991 163644 163644 163644 163644 163644 163644 163644 ...
## $ X65. : int 138485 100366 77273 97955 97955 97955 97955 97955 97955 97955 ...
## $ Divorces : int 33651 36117 19976 29619 29619 29619 29619 29619 29619 29619 ...
## $ English.speaking : int 21042 16754 28502 9038 9038 9038 9038 9038 9038 9038 ...
## $ No.schooling..Pre.primary : int 23269 24183 14160 20637 20637 20637 20637 20637 20637 20637 ...
## $ Post.secondary..Degree.course: int 154778 116916 128362 79104 79104 79104 79104 79104 79104 79104 ...
## $ Study.in.the.same.district.3.: int 56374 36841 33449 49082 49082 49082 49082 49082 49082 49082 ...
## $ Working.population : int 340605 332587 262126 247419 247419 247419 247419 247419 247419 247419 ...
## $ Employers : int 9333 8749 8206 5248 5248 5248 5248 5248 5248 5248 ...
## $ REGION.x : chr "NTE" "NTW" "NTE" "NTW" ...
## $ Post_Secondary_Percentage : num 22.3 17.5 26.2 15.6 15.6 ...
## $ Rank_Score : num 204372 199559 157286 148458 148458 ...
## $ Cluster : int 3 3 3 3 3 3 3 3 3 3 ...
## $ REGION.y : chr "NTE" "NTW" "NTE" "NTW" ...
## $ Total_Parking : num 1587 957 1203 1171 1171 ...
## $ Year : int NA NA NA NA NA NA NA NA NA NA ...
## $ Housing_Price : num 75031 75031 75031 75031 75031 ...
## $ REGION.x.x : chr "NTE" "NTE" "NTE" "NTE" ...
## $ Date : chr NA NA NA "44197" ...
## $ Control.Point : chr "Lo Wu" "Lo Wu" "Lo Wu" "Tuen Mun Ferry Terminal" ...
## $ Arrival...Departure : chr NA NA NA "Arrival" ...
## $ Hong.Kong.Residents : num 13709 13709 13709 0 0 ...
## $ Mainland.Visitors : num 3690 3690 3690 0 0 ...
## $ Other.Visitors : num 138 138 138 0 0 ...
## $ Total : num 17537 17537 17537 0 0 ...
## $ REGION.y.y : chr "NTE" "NTE" "NTE" "NTW" ...
## $ Avg_SCHOOL_NO : num 3.50e+11 4.14e+11 4.65e+11 3.31e+11 3.31e+11 ...
## $ Most_Common_ENGLISH_CATEGORY : chr "Kindergartens" "Kindergartens" "Kindergarten-cum-child Care Centres" "Kindergartens" ...
## $ Avg_SESSION_RANK : num 8.08 8 7.79 8.27 8.27 ...
## $ Avg_ENGLISH_CATEGORY_RANK : num 2.47 2.28 2.18 2.35 2.35 ...
## $ Avg_FINANCE_TYPE_RANK : num 6.1 6 5.85 6.09 6.09 ...
## $ Avg_SCHOOL_LEVEL_RANK : num 6.88 6.66 6.62 6.78 6.78 ...
## $ Avg_TOTAL_RANK : num 23.5 22.9 22.4 23.5 23.5 ...
## $ Avg_RANK : num 822 873 896 828 828 ...
## $ Most_Common_STUDENTS_GENDER : chr "CO-ED" "CO-ED" "CO-ED" "CO-ED" ...
## $ Most_Common_SESSION : chr "WHOLE DAY" "WHOLE DAY" "WHOLE DAY" "WHOLE DAY" ...
## $ Most_Common_FINANCE_TYPE : chr "PRIVATE" "PRIVATE" "PRIVATE" "PRIVATE" ...
## $ Most_Common_SCHOOL_LEVEL : chr "KINDERGARTEN" "KINDERGARTEN" "KINDERGARTEN-CUM-CHILD CARE CENTRES" "KINDERGARTEN" ...
dim(merged_data)
## [1] 360312 45
#chcek missing
colSums(is.na(merged_data))
## District Total.population Male
## 0 0 0
## Female X..15 X15...24
## 0 0 0
## X25...44 X45...64 X65.
## 0 0 0
## Divorces English.speaking No.schooling..Pre.primary
## 0 0 0
## Post.secondary..Degree.course Study.in.the.same.district.3. Working.population
## 0 0 0
## Employers REGION.x Post_Secondary_Percentage
## 0 0 0
## Rank_Score Cluster REGION.y
## 0 0 0
## Total_Parking Year Housing_Price
## 0 812 0
## REGION.x.x Date Control.Point
## 0 6 0
## Arrival...Departure Hong.Kong.Residents Mainland.Visitors
## 6 0 0
## Other.Visitors Total REGION.y.y
## 0 0 0
## Avg_SCHOOL_NO Most_Common_ENGLISH_CATEGORY Avg_SESSION_RANK
## 0 0 0
## Avg_ENGLISH_CATEGORY_RANK Avg_FINANCE_TYPE_RANK Avg_SCHOOL_LEVEL_RANK
## 0 0 0
## Avg_TOTAL_RANK Avg_RANK Most_Common_STUDENTS_GENDER
## 0 0 0
## Most_Common_SESSION Most_Common_FINANCE_TYPE Most_Common_SCHOOL_LEVEL
## 0 0 0
#Summary statistics
summary(merged_data)
## District Total.population Male Female X..15
## NORTH :359500 Min. :309631 Min. :145343 Min. :164288 Min. :33499
## TUEN MUN : 806 1st Qu.:309631 1st Qu.:145343 1st Qu.:164288 1st Qu.:33499
## KWAI TSING: 1 Median :309631 Median :145343 Median :164288 Median :33499
## SAI KUNG : 1 Mean :310075 Mean :145557 Mean :164518 Mean :33546
## SHA TIN : 1 3rd Qu.:309631 3rd Qu.:145343 3rd Qu.:164288 3rd Qu.:33499
## TAI PO : 1 Max. :692806 Max. :317927 Max. :374879 Max. :78886
## (Other) : 2
## X15...24 X25...44 X45...64 X65. Divorces
## Min. :23198 Min. : 88314 Min. :102810 Min. : 55557 Min. :13173
## 1st Qu.:27654 1st Qu.: 88314 1st Qu.:104607 1st Qu.: 55557 1st Qu.:17410
## Median :27654 Median : 88314 Median :104607 Median : 55557 Median :17410
## Mean :27679 Mean : 88459 Mean :104740 Mean : 55652 Mean :17437
## 3rd Qu.:27654 3rd Qu.: 88314 3rd Qu.:104607 3rd Qu.: 55557 3rd Qu.:17410
## Max. :58988 Max. :211376 Max. :220604 Max. :138485 Max. :36117
##
## English.speaking No.schooling..Pre.primary Post.secondary..Degree.course
## Min. : 4330 Min. :10535 Min. : 48482
## 1st Qu.: 4330 1st Qu.:11971 1st Qu.: 48482
## Median : 4330 Median :11971 Median : 48482
## Mean : 4341 Mean :11990 Mean : 48551
## 3rd Qu.: 4330 3rd Qu.:11971 3rd Qu.: 48482
## Max. :28502 Max. :27431 Max. :154778
##
## Study.in.the.same.district.3. Working.population Employers REGION.x
## Min. :22365 Min. :146416 Min. :2622 Length:360312
## 1st Qu.:23134 1st Qu.:146416 1st Qu.:2622 Class :character
## Median :23134 Median :146416 Median :2622 Mode :character
## Mean :23192 Mean :146644 Mean :2628
## 3rd Qu.:23134 3rd Qu.:146416 3rd Qu.:2622
## Max. :56374 Max. :340605 Max. :9333
##
## Post_Secondary_Percentage Rank_Score Cluster REGION.y
## Min. :14.78 Min. : 87856 Min. :1.000 Length:360312
## 1st Qu.:15.66 1st Qu.: 87856 1st Qu.:2.000 Class :character
## Median :15.66 Median : 87856 Median :2.000 Mode :character
## Mean :15.66 Mean : 87992 Mean :2.002
## 3rd Qu.:15.66 3rd Qu.: 87856 3rd Qu.:2.000
## Max. :26.25 Max. :204372 Max. :3.000
##
## Total_Parking Year Housing_Price REGION.x.x Date
## Min. : 506 Min. :1999 Min. : 20843 Length:360312 Length:360312
## 1st Qu.:1097 1st Qu.:2005 1st Qu.: 31444 Class :character Class :character
## Median :1097 Median :2011 Median : 58093 Mode :character Mode :character
## Mean :1097 Mean :2011 Mean : 75031
## 3rd Qu.:1097 3rd Qu.:2017 3rd Qu.:121633
## Max. :1587 Max. :2023 Max. :152765
## NA's :812
## Control.Point Arrival...Departure Hong.Kong.Residents Mainland.Visitors
## Length:360312 Length:360312 Min. : 0 Min. : 0
## Class :character Class :character 1st Qu.: 0 1st Qu.: 0
## Mode :character Mode :character Median : 0 Median : 0
## Mean : 13709 Mean : 3690
## 3rd Qu.: 19174 3rd Qu.: 4452
## Max. :155071 Max. :53608
##
## Other.Visitors Total REGION.y.y Avg_SCHOOL_NO
## Min. : 0.0 Min. : 0 Length:360312 Min. :3.305e+11
## 1st Qu.: 0.0 1st Qu.: 0 Class :character 1st Qu.:3.575e+11
## Median : 0.0 Median : 0 Mode :character Median :3.575e+11
## Mean : 137.7 Mean : 17537 Mean :3.574e+11
## 3rd Qu.: 259.0 3rd Qu.: 24286 3rd Qu.:3.575e+11
## Max. :1263.0 Max. :168435 Max. :4.648e+11
##
## Most_Common_ENGLISH_CATEGORY Avg_SESSION_RANK Avg_ENGLISH_CATEGORY_RANK
## Length:360312 Min. :7.791 Min. :2.176
## Class :character 1st Qu.:8.113 1st Qu.:2.310
## Mode :character Median :8.113 Median :2.310
## Mean :8.113 Mean :2.310
## 3rd Qu.:8.113 3rd Qu.:2.310
## Max. :8.317 Max. :2.655
##
## Avg_FINANCE_TYPE_RANK Avg_SCHOOL_LEVEL_RANK Avg_TOTAL_RANK Avg_RANK
## Min. :5.850 Min. :6.571 Min. :22.44 Min. :785.0
## 1st Qu.:6.006 1st Qu.:6.571 1st Qu.:23.00 1st Qu.:866.4
## Median :6.006 Median :6.571 Median :23.00 Median :866.4
## Mean :6.006 Mean :6.572 Mean :23.00 Mean :866.3
## 3rd Qu.:6.006 3rd Qu.:6.571 3rd Qu.:23.00 3rd Qu.:866.4
## Max. :6.096 Max. :6.952 Max. :23.96 Max. :896.4
##
## Most_Common_STUDENTS_GENDER Most_Common_SESSION Most_Common_FINANCE_TYPE
## Length:360312 Length:360312 Length:360312
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Most_Common_SCHOOL_LEVEL
## Length:360312
## Class :character
## Mode :character
##
##
##
##
#Univariate analysis
table(merged_data$REGION.y,merged_data$REGION.x)
##
## NTE NTW
## NTE 359503 0
## NTW 0 809
barplot(table(merged_data$REGION.y,merged_data$REGION.x))
hist(merged_data$`Total.population`, main = "Total Population Distribution", xlab = "Total Population")
boxplot(merged_data$`Housing_Price`, main = "Housing Price Distribution", ylab = "Price")
#Outlier detection
boxplot(merged_data$Housing_Price, main = "Boxplot for Housing Prices")
#Bivariate Analysis
#Numerical vs numerical
plot(merged_data$`Housing_Price`, merged_data$`Total.population`,
main = "Housing Price vs Total Population",
xlab = "Housing Price", ylab = "Total Population")
cor(merged_data$Housing_Price, merged_data$`Total.population`, use = "complete.obs")
## [1] 6.673583e-15
#correlation between housing price and totalpopulation=0.4719672
#Categorical vs. Numerical:
#Boxplot of housing prices for each region
boxplot(Housing_Price ~ REGION.x, data = merged_data,
main = "Housing Prices by Region",
xlab = "Region", ylab = "Housing Price")
# columns that are "constant" (have only one unique value excluding NA)
constant_cols <- sapply(merged_data, function(col) length(unique(na.omit(col))) == 1)
# Drop these constant columns
merged_data <- merged_data[, !constant_cols]
#Multivariate Analysis
#Heatmap of correlations between numerical variables:
library(ggcorrplot)
num_data <- merged_data %>% select_if(is.numeric)
constant_cols <- sapply(num_data, function(x) sd(x, na.rm = TRUE) == 0)
constant_cols
## Total.population Male Female
## FALSE FALSE FALSE
## X..15 X15...24 X25...44
## FALSE FALSE FALSE
## X45...64 X65. Divorces
## FALSE FALSE FALSE
## English.speaking No.schooling..Pre.primary Post.secondary..Degree.course
## FALSE FALSE FALSE
## Study.in.the.same.district.3. Working.population Employers
## FALSE FALSE FALSE
## Post_Secondary_Percentage Rank_Score Cluster
## FALSE FALSE FALSE
## Total_Parking Year Housing_Price
## FALSE FALSE FALSE
## Hong.Kong.Residents Mainland.Visitors Other.Visitors
## FALSE FALSE FALSE
## Total Avg_SCHOOL_NO Avg_SESSION_RANK
## FALSE FALSE FALSE
## Avg_ENGLISH_CATEGORY_RANK Avg_FINANCE_TYPE_RANK Avg_SCHOOL_LEVEL_RANK
## FALSE FALSE FALSE
## Avg_TOTAL_RANK Avg_RANK
## FALSE FALSE
num_data <- num_data[, !constant_cols]
corr_matrix <- cor(num_data, use = "complete.obs")
## Warning in cor(num_data, use = "complete.obs"): the standard deviation is zero
ggcorrplot(corr_matrix, lab = TRUE)
#Principal Component Analysis (PCA) for dimensionality reduction and pattern discovery:
library(FactoMineR)
pca_results <- PCA(merged_data %>% select_if(is.numeric), graph = TRUE)
## Warning in PCA(merged_data %>% select_if(is.numeric), graph = TRUE): Missing values are
## imputed by the mean of the variable: you should use the imputePCA function of the missMDA
## package
## Warning: ggrepel: 24 unlabeled data points (too many overlaps). Consider increasing
## max.overlaps
colnames(merged_data)
## [1] "District" "Total.population"
## [3] "Male" "Female"
## [5] "X..15" "X15...24"
## [7] "X25...44" "X45...64"
## [9] "X65." "Divorces"
## [11] "English.speaking" "No.schooling..Pre.primary"
## [13] "Post.secondary..Degree.course" "Study.in.the.same.district.3."
## [15] "Working.population" "Employers"
## [17] "REGION.x" "Post_Secondary_Percentage"
## [19] "Rank_Score" "Cluster"
## [21] "REGION.y" "Total_Parking"
## [23] "Year" "Housing_Price"
## [25] "Date" "Control.Point"
## [27] "Arrival...Departure" "Hong.Kong.Residents"
## [29] "Mainland.Visitors" "Other.Visitors"
## [31] "Total" "REGION.y.y"
## [33] "Avg_SCHOOL_NO" "Most_Common_ENGLISH_CATEGORY"
## [35] "Avg_SESSION_RANK" "Avg_ENGLISH_CATEGORY_RANK"
## [37] "Avg_FINANCE_TYPE_RANK" "Avg_SCHOOL_LEVEL_RANK"
## [39] "Avg_TOTAL_RANK" "Avg_RANK"
## [41] "Most_Common_SCHOOL_LEVEL"
#Key Insights and Visualization
ggplot(merged_data, aes(x = Housing_Price)) +
geom_histogram(binwidth = 5000, fill = "steelblue", color = "white") +
labs(title = "Distribution of Housing Prices", x = "Housing_Price", y = "Frequency")
#Interactive plot#s
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#plot_ly(merged_data, x = ~Housing_Price, y = ~Total.population, type = 'scatter', mode = 'markers')
#Correlation analysis
# Calculate Pearson's correlation for relevant variables
# Calculate Pearson's correlation for relevant numeric variables
correlation_matrix <- cor(merged_data[c("Housing_Price", "Total.population", "Total",
"Total_Parking", "Post.secondary..Degree.course",
"Rank_Score", "Working.population", "Divorces",
"Hong.Kong.Residents", "Mainland.Visitors",
"Other.Visitors", "Avg_SCHOOL_NO",
"Avg_SESSION_RANK", "Avg_ENGLISH_CATEGORY_RANK",
"Avg_FINANCE_TYPE_RANK", "Avg_SCHOOL_LEVEL_RANK",
"Avg_TOTAL_RANK", "Avg_RANK")],
use = "complete.obs")
# View the correlation matrix
print(correlation_matrix)
## Housing_Price Total.population Total Total_Parking
## Housing_Price 1.000000e+00 6.673583e-15 2.717929e-15 -3.735370e-14
## Total.population 6.673583e-15 1.000000e+00 -2.793183e-02 8.962105e-01
## Total 2.717929e-15 -2.793183e-02 1.000000e+00 -2.521344e-02
## Total_Parking -3.735370e-14 8.962105e-01 -2.521344e-02 1.000000e+00
## Post.secondary..Degree.course -4.406838e-14 9.961201e-01 -2.764660e-02 8.964682e-01
## Rank_Score 1.354517e-14 9.999503e-01 -2.792485e-02 8.961605e-01
## Working.population -5.169219e-15 9.999503e-01 -2.792486e-02 8.961606e-01
## Divorces -7.159718e-14 9.992776e-01 -2.799620e-02 8.964774e-01
## Hong.Kong.Residents 1.251160e-14 -2.755736e-02 9.944558e-01 -2.487541e-02
## Mainland.Visitors -3.351274e-15 -2.649244e-02 9.266113e-01 -2.391414e-02
## Other.Visitors -9.362999e-15 -2.875393e-02 8.482762e-01 -2.595553e-02
## Avg_SCHOOL_NO -5.964421e-15 -9.736710e-01 2.771496e-02 -8.902775e-01
## Avg_SESSION_RANK 4.761092e-14 9.845755e-01 -2.792219e-02 9.077867e-01
## Avg_ENGLISH_CATEGORY_RANK 3.059107e-14 9.270314e-01 -2.615259e-02 9.082421e-01
## Avg_FINANCE_TYPE_RANK -2.455677e-14 9.916551e-01 -2.798013e-02 9.066681e-01
## Avg_SCHOOL_LEVEL_RANK 3.031967e-14 9.958140e-01 -2.798463e-02 9.075818e-01
## Avg_TOTAL_RANK 5.649427e-15 9.915495e-01 -2.797322e-02 9.129204e-01
## Avg_RANK 2.182079e-14 -9.917905e-01 2.796856e-02 -9.159251e-01
## Post.secondary..Degree.course Rank_Score
## Housing_Price -4.406838e-14 1.354517e-14
## Total.population 9.961201e-01 9.999503e-01
## Total -2.764660e-02 -2.792485e-02
## Total_Parking 8.964682e-01 8.961605e-01
## Post.secondary..Degree.course 1.000000e+00 9.967163e-01
## Rank_Score 9.967163e-01 1.000000e+00
## Working.population 9.967160e-01 1.000000e+00
## Divorces 9.921101e-01 9.989595e-01
## Hong.Kong.Residents -2.727595e-02 -2.755048e-02
## Mainland.Visitors -2.622191e-02 -2.648582e-02
## Other.Visitors -2.846030e-02 -2.874675e-02
## Avg_SCHOOL_NO -9.549889e-01 -9.721736e-01
## Avg_SESSION_RANK 9.686359e-01 9.835895e-01
## Avg_ENGLISH_CATEGORY_RANK 9.263968e-01 9.271543e-01
## Avg_FINANCE_TYPE_RANK 9.788458e-01 9.906988e-01
## Avg_SCHOOL_LEVEL_RANK 9.907283e-01 9.958380e-01
## Avg_TOTAL_RANK 9.820547e-01 9.910959e-01
## Avg_RANK -9.834338e-01 -9.914752e-01
## Working.population Divorces Hong.Kong.Residents
## Housing_Price -5.169219e-15 -7.159718e-14 1.251160e-14
## Total.population 9.999503e-01 9.992776e-01 -2.755736e-02
## Total -2.792486e-02 -2.799620e-02 9.944558e-01
## Total_Parking 8.961606e-01 8.964774e-01 -2.487541e-02
## Post.secondary..Degree.course 9.967160e-01 9.921101e-01 -2.727595e-02
## Rank_Score 1.000000e+00 9.989595e-01 -2.755048e-02
## Working.population 1.000000e+00 9.989597e-01 -2.755048e-02
## Divorces 9.989597e-01 1.000000e+00 -2.762087e-02
## Hong.Kong.Residents -2.755048e-02 -2.762087e-02 1.000000e+00
## Mainland.Visitors -2.648583e-02 -2.655350e-02 8.819800e-01
## Other.Visitors -2.874675e-02 -2.882020e-02 8.434681e-01
## Avg_SCHOOL_NO -9.721742e-01 -9.797933e-01 2.734340e-02
## Avg_SESSION_RANK 9.835900e-01 9.894538e-01 -2.754785e-02
## Avg_ENGLISH_CATEGORY_RANK 9.271541e-01 9.260159e-01 -2.580198e-02
## Avg_FINANCE_TYPE_RANK 9.906992e-01 9.950895e-01 -2.760501e-02
## Avg_SCHOOL_LEVEL_RANK 9.958380e-01 9.958970e-01 -2.760946e-02
## Avg_TOTAL_RANK 9.910961e-01 9.936316e-01 -2.759820e-02
## Avg_RANK -9.914754e-01 -9.934287e-01 2.759360e-02
## Mainland.Visitors Other.Visitors Avg_SCHOOL_NO
## Housing_Price -3.351274e-15 -9.362999e-15 -5.964421e-15
## Total.population -2.649244e-02 -2.875393e-02 -9.736710e-01
## Total 9.266113e-01 8.482762e-01 2.771496e-02
## Total_Parking -2.391414e-02 -2.595553e-02 -8.902775e-01
## Post.secondary..Degree.course -2.622191e-02 -2.846030e-02 -9.549889e-01
## Rank_Score -2.648582e-02 -2.874675e-02 -9.721736e-01
## Working.population -2.648583e-02 -2.874675e-02 -9.721742e-01
## Divorces -2.655350e-02 -2.882020e-02 -9.797933e-01
## Hong.Kong.Residents 8.819800e-01 8.434681e-01 2.734340e-02
## Mainland.Visitors 1.000000e+00 7.767572e-01 2.628675e-02
## Other.Visitors 7.767572e-01 1.000000e+00 2.853068e-02
## Avg_SCHOOL_NO 2.628675e-02 2.853068e-02 1.000000e+00
## Avg_SESSION_RANK -2.648330e-02 -2.874401e-02 -9.955158e-01
## Avg_ENGLISH_CATEGORY_RANK -2.480490e-02 -2.692233e-02 -9.423869e-01
## Avg_FINANCE_TYPE_RANK -2.653825e-02 -2.880365e-02 -9.941441e-01
## Avg_SCHOOL_LEVEL_RANK -2.654253e-02 -2.880829e-02 -9.834260e-01
## Avg_TOTAL_RANK -2.653170e-02 -2.879654e-02 -9.913951e-01
## Avg_RANK 2.652728e-02 2.879174e-02 9.896727e-01
## Avg_SESSION_RANK Avg_ENGLISH_CATEGORY_RANK
## Housing_Price 4.761092e-14 3.059107e-14
## Total.population 9.845755e-01 9.270314e-01
## Total -2.792219e-02 -2.615259e-02
## Total_Parking 9.077867e-01 9.082421e-01
## Post.secondary..Degree.course 9.686359e-01 9.263968e-01
## Rank_Score 9.835895e-01 9.271543e-01
## Working.population 9.835900e-01 9.271541e-01
## Divorces 9.894538e-01 9.260159e-01
## Hong.Kong.Residents -2.754785e-02 -2.580198e-02
## Mainland.Visitors -2.648330e-02 -2.480490e-02
## Other.Visitors -2.874401e-02 -2.692233e-02
## Avg_SCHOOL_NO -9.955158e-01 -9.423869e-01
## Avg_SESSION_RANK 1.000000e+00 9.413553e-01
## Avg_ENGLISH_CATEGORY_RANK 9.413553e-01 1.000000e+00
## Avg_FINANCE_TYPE_RANK 9.976824e-01 9.402989e-01
## Avg_SCHOOL_LEVEL_RANK 9.912731e-01 9.527621e-01
## Avg_TOTAL_RANK 9.967397e-01 9.567032e-01
## Avg_RANK -9.957855e-01 -9.581160e-01
## Avg_FINANCE_TYPE_RANK Avg_SCHOOL_LEVEL_RANK Avg_TOTAL_RANK
## Housing_Price -2.455677e-14 3.031967e-14 5.649427e-15
## Total.population 9.916551e-01 9.958140e-01 9.915495e-01
## Total -2.798013e-02 -2.798463e-02 -2.797322e-02
## Total_Parking 9.066681e-01 9.075818e-01 9.129204e-01
## Post.secondary..Degree.course 9.788458e-01 9.907283e-01 9.820547e-01
## Rank_Score 9.906988e-01 9.958380e-01 9.910959e-01
## Working.population 9.906992e-01 9.958380e-01 9.910961e-01
## Divorces 9.950895e-01 9.958970e-01 9.936316e-01
## Hong.Kong.Residents -2.760501e-02 -2.760946e-02 -2.759820e-02
## Mainland.Visitors -2.653825e-02 -2.654253e-02 -2.653170e-02
## Other.Visitors -2.880365e-02 -2.880829e-02 -2.879654e-02
## Avg_SCHOOL_NO -9.941441e-01 -9.834260e-01 -9.913951e-01
## Avg_SESSION_RANK 9.976824e-01 9.912731e-01 9.967397e-01
## Avg_ENGLISH_CATEGORY_RANK 9.402989e-01 9.527621e-01 9.567032e-01
## Avg_FINANCE_TYPE_RANK 1.000000e+00 9.944607e-01 9.976760e-01
## Avg_SCHOOL_LEVEL_RANK 9.944607e-01 1.000000e+00 9.981730e-01
## Avg_TOTAL_RANK 9.976760e-01 9.981730e-01 1.000000e+00
## Avg_RANK -9.967574e-01 -9.986619e-01 -9.998748e-01
## Avg_RANK
## Housing_Price 2.182079e-14
## Total.population -9.917905e-01
## Total 2.796856e-02
## Total_Parking -9.159251e-01
## Post.secondary..Degree.course -9.834338e-01
## Rank_Score -9.914752e-01
## Working.population -9.914754e-01
## Divorces -9.934287e-01
## Hong.Kong.Residents 2.759360e-02
## Mainland.Visitors 2.652728e-02
## Other.Visitors 2.879174e-02
## Avg_SCHOOL_NO 9.896727e-01
## Avg_SESSION_RANK -9.957855e-01
## Avg_ENGLISH_CATEGORY_RANK -9.581160e-01
## Avg_FINANCE_TYPE_RANK -9.967574e-01
## Avg_SCHOOL_LEVEL_RANK -9.986619e-01
## Avg_TOTAL_RANK -9.998748e-01
## Avg_RANK 1.000000e+00
#Visualize the correlation matrix
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(ggplot2)
# Reshape the correlation matrix for ggplot
correlation_melt <- melt(correlation_matrix)
ggplot(correlation_melt, aes(Var1, Var2, fill = value)) +
geom_tile() +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0) +
theme_minimal() +
labs(title = "Correlation Heatmap", x = "", y = "")
#Statistical Tests
#t-test
#comparing house prices between New Territories East (NTE) and New Territories West (NTW).
# Perform t-test for house prices between NTE and NTW
colnames(Housing_long)
## [1] "Year" "District" "Housing_Price" "REGION"
t_test_result <- t.test(Housing_Price ~REGION.x, data = merged_data)
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: Housing_Price by REGION.x
## t = -2.4357e-09, df = 359502, p-value = 1
## alternative hypothesis: true difference in means between group NTE and group NTW is not equal to 0
## 95 percent confidence interval:
## -154.2993 154.2993
## sample estimates:
## mean in group NTE mean in group NTW
## 75030.6 75030.6
#If the p-value is less than 0.05, it suggests that the means between the two regions are significantly different
#ANOVA
# ANOVA to compare house prices across different districts
anova_result <- aov(Housing_Price ~ District, data = merged_data)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## District 7 0.00e+00 0.000e+00 0 1
## Residuals 360304 8.01e+14 2.223e+09
# ANOVA table
anova_table <- anova(anova_result)
print(anova_table)
## Analysis of Variance Table
##
## Response: Housing_Price
## Df Sum Sq Mean Sq F value Pr(>F)
## District 7 0.00e+00 0 0 1
## Residuals 360304 8.01e+14 2223122122
colnames(merged_data)
## [1] "District" "Total.population"
## [3] "Male" "Female"
## [5] "X..15" "X15...24"
## [7] "X25...44" "X45...64"
## [9] "X65." "Divorces"
## [11] "English.speaking" "No.schooling..Pre.primary"
## [13] "Post.secondary..Degree.course" "Study.in.the.same.district.3."
## [15] "Working.population" "Employers"
## [17] "REGION.x" "Post_Secondary_Percentage"
## [19] "Rank_Score" "Cluster"
## [21] "REGION.y" "Total_Parking"
## [23] "Year" "Housing_Price"
## [25] "Date" "Control.Point"
## [27] "Arrival...Departure" "Hong.Kong.Residents"
## [29] "Mainland.Visitors" "Other.Visitors"
## [31] "Total" "REGION.y.y"
## [33] "Avg_SCHOOL_NO" "Most_Common_ENGLISH_CATEGORY"
## [35] "Avg_SESSION_RANK" "Avg_ENGLISH_CATEGORY_RANK"
## [37] "Avg_FINANCE_TYPE_RANK" "Avg_SCHOOL_LEVEL_RANK"
## [39] "Avg_TOTAL_RANK" "Avg_RANK"
## [41] "Most_Common_SCHOOL_LEVEL"
##Linear regression
# Fit a linear regression model
# Fit a linear regression model
model <- lm(Housing_Price ~ Hong.Kong.Residents + Mainland.Visitors + Avg_SCHOOL_LEVEL_RANK, data = merged_data)
summary(model)
##
## Call:
## lm(formula = Housing_Price ~ Hong.Kong.Residents + Mainland.Visitors +
## Avg_SCHOOL_LEVEL_RANK, data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54188 -43587 -16938 46602 77734
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.503e+04 5.119e+04 1.466 0.143
## Hong.Kong.Residents 1.080e-14 7.114e-03 0.000 1.000
## Mainland.Visitors -6.512e-15 2.541e-02 0.000 1.000
## Avg_SCHOOL_LEVEL_RANK 1.658e-07 7.788e+03 0.000 1.000
##
## Residual standard error: 47150 on 360308 degrees of freedom
## Multiple R-squared: 5.856e-24, Adjusted R-squared: -8.326e-06
## F-statistic: 7.033e-19 on 3 and 360308 DF, p-value: 1
# Diagnostic plots for the linear regression model
par(mfrow = c(2, 2)) # Set up a 2x2 plot grid
plot(model)
#Chi-Square Test for Categorical Data
# Perform Chi-square test for Cluster vs REGION
chi_test <- chisq.test(table(merged_data$Cluster, merged_data$REGION.x))
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be incorrect
print(chi_test)
##
## Pearson's Chi-squared test
##
## data: table(merged_data$Cluster, merged_data$REGION.x)
## X-squared = 359198, df = 2, p-value < 2.2e-16
#Normalization to 3NF:
colnames(merged_data)
## [1] "District" "Total.population"
## [3] "Male" "Female"
## [5] "X..15" "X15...24"
## [7] "X25...44" "X45...64"
## [9] "X65." "Divorces"
## [11] "English.speaking" "No.schooling..Pre.primary"
## [13] "Post.secondary..Degree.course" "Study.in.the.same.district.3."
## [15] "Working.population" "Employers"
## [17] "REGION.x" "Post_Secondary_Percentage"
## [19] "Rank_Score" "Cluster"
## [21] "REGION.y" "Total_Parking"
## [23] "Year" "Housing_Price"
## [25] "Date" "Control.Point"
## [27] "Arrival...Departure" "Hong.Kong.Residents"
## [29] "Mainland.Visitors" "Other.Visitors"
## [31] "Total" "REGION.y.y"
## [33] "Avg_SCHOOL_NO" "Most_Common_ENGLISH_CATEGORY"
## [35] "Avg_SESSION_RANK" "Avg_ENGLISH_CATEGORY_RANK"
## [37] "Avg_FINANCE_TYPE_RANK" "Avg_SCHOOL_LEVEL_RANK"
## [39] "Avg_TOTAL_RANK" "Avg_RANK"
## [41] "Most_Common_SCHOOL_LEVEL"
library(dplyr)
#Structuring the Data into Separate Tables (for 2NF):
# Demographic Data Table (District as Primary Key)
district_data <- merged_data %>%
select(District, Total.population, Male, Female, Divorces)
# Housing Data Table (District as Foreign Key)
housing_data <- merged_data %>%
select(District, Housing_Price, Total_Parking)
# School Data Table (District as Foreign Key)
school_data <- merged_data %>%
select(District, Avg_SCHOOL_LEVEL_RANK, Avg_TOTAL_RANK)
# Traffic Data Table (District as Foreign Key)
traffic_data <- merged_data %>%
select(District, Hong.Kong.Residents, Mainland.Visitors)
###Create the Scoring System:
# Min-Max scaling function to scale values between 0 and 10
normalize_score <- function(x) {
return (10 * (x - min(x)) / (max(x) - min(x)))
}
# Score each characteristic
merged_data$Housing_Price_Score <- normalize_score(merged_data$Housing_Price)
merged_data$Avg_School_Rank_Score <- 10 - normalize_score(merged_data$Avg_SCHOOL_LEVEL_RANK) # Higher rank gets higher score
merged_data$Total_Parking_Score <- normalize_score(merged_data$Total_Parking)
merged_data$Post_Secondary_Percentage_Score <- normalize_score(merged_data$Post_Secondary_Percentage)
merged_data$Divorces_Score <- normalize_score(merged_data$Divorces)
merged_data$Total_Residents_Score <- normalize_score(merged_data$Hong.Kong.Residents)
merged_data$Total_Visitors_Score <- normalize_score(merged_data$Mainland.Visitors)
# Combine scores for each district
merged_data$Total_Score <- rowSums(merged_data[c("Housing_Price_Score", "Avg_School_Rank_Score", "Total_Parking_Score",
"Post_Secondary_Percentage_Score", "Divorces_Score", "Total_Residents_Score",
"Total_Visitors_Score")])
# Normalize the total score (if necessary)
merged_data$Total_Score <- normalize_score(merged_data$Total_Score)
# Sort districts by Total_Score and display the top 3
top_districts <- merged_data %>%
arrange(desc(Total_Score)) %>%
head(3)
# Display the top 3 districts along with their REGION (NTE or NTW)
top_districts <- top_districts[, c("District", "Total_Score", "REGION.x")]
# Visualization of the top 3 districts, including region
ggplot(top_districts, aes(x = reorder(District, Total_Score), y = Total_Score, fill = REGION.x)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("NTE" = "lightgreen", "NTW" = "lightblue")) + # Custom colors for NTE and NTW
theme_minimal() +
labs(title = "Top 3 Districts by Total Score", x = "District", y = "Total Score", fill = "Region") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the result
print(top_districts)
## District Total_Score REGION.x
## 1 NORTH 10.000000 NTE
## 2 NORTH 9.932890 NTE
## 3 NORTH 9.906578 NTE