airbnb_data_preprocessing.py

# coding: utf-8
import os
import warnings
from pathlib import Path
from functools import reduce
from itertools import chain
from collections import Counter

import numpy as np
import pandas as pd
import gender_guesser.detector as gender
from sklearn.preprocessing import MultiLabelBinarizer

warnings.filterwarnings("ignore")

DATA_PATH = Path("data/airbnb")
fname = "listings.csv.gz"
if not os.path.exists(DATA_PATH):
    os.makedirs(DATA_PATH)

df_original = pd.read_csv(DATA_PATH / fname)
print(df_original.shape)
df_original.head()

# this is just subjective. One can choose some other columns
keep_cols = [
    "id",
    "host_id",
    "description",
    "house_rules",
    "host_name",
    "host_listings_count",
    "host_identity_verified",
    "neighbourhood_cleansed",
    "latitude",
    "longitude",
    "is_location_exact",
    "property_type",
    "room_type",
    "accommodates",
    "bathrooms",
    "bedrooms",
    "beds",
    "amenities",
    "price",
    "security_deposit",
    "cleaning_fee",
    "guests_included",
    "extra_people",
    "minimum_nights",
    "instant_bookable",
    "cancellation_policy",
    "reviews_per_month",
]

df = df_original[keep_cols]
df = df[~df.reviews_per_month.isna()]
df = df[~df.description.isna()]
df = df[~df.host_listings_count.isna()]
print(df.shape)

# This is a preprocessing stage before preparing the data to be passed to WideDeep
# Let's go "column by column"

# house rules
#
# I will simply include a binary column with 1/0 if the property has/has not
# house rules.
df["has_house_rules"] = df["house_rules"]
df.has_house_rules.fillna(0, inplace=True)
df["has_house_rules"][df.has_house_rules != 0] = 1
df.drop("house_rules", axis=1, inplace=True)

# host_name
#
# I will use names to infer gender using `gender_guesser`

host_name = df.host_name.tolist()
d = gender.Detector()
host_gender = [d.get_gender(n) for n in host_name]
replace_dict = {"mostly_male": "male", "mostly_female": "female", "andy": "unknown"}
host_gender = [replace_dict.get(item, item) for item in host_gender]
Counter(host_gender)
df["host_gender"] = host_gender
df.drop("host_name", axis=1, inplace=True)
df.head()

# property_type, room_type, accommodates, bathrooms, bedrooms, beds and
# guests_included, host_listings_count, minimum_nights
#
# Here some standard pre-processing
df.property_type.value_counts()
replace_prop_type = [
    val
    for val in df.property_type.unique().tolist()
    if val not in ["Apartment", "House"]
]
replace_prop_type = {k: "other" for k in replace_prop_type}
df.property_type.replace(replace_prop_type, inplace=True)
df["property_type"] = df.property_type.apply(lambda x: "_".join(x.split(" ")).lower())

df.room_type.value_counts()
df["room_type"] = df.room_type.apply(lambda x: "_".join(x.split(" ")).lower())

df["bathrooms"][(df.bathrooms.isna()) & (df.room_type == "private_room")] = 0
df["bathrooms"][(df.bathrooms.isna()) & (df.room_type == "entire_home/apt")] = 1
df.bedrooms.fillna(1, inplace=True)
df.beds.fillna(1, inplace=True)

# Encode some as categorical
categorical_cut = [
    ("accommodates", 3),
    ("guests_included", 3),
    ("minimum_nights", 3),
    ("host_listings_count", 3),
    ("bathrooms", 1.5),
    ("bedrooms", 3),
    ("beds", 3),
]

for col, cut in categorical_cut:
    new_colname = col + "_catg"
    df[new_colname] = df[col].apply(lambda x: cut if x >= cut else x)
    df[new_colname] = df[new_colname].round().astype(int)

# Amenities
#
# I will just add a number of dummy columns with 1/0 if the property has/has
# not that particular amenity
amenity_repls = (
    ('"', ""),
    ("{", ""),
    ("}", ""),
    (" / ", "_"),
    ("/", "_"),
    (" ", "_"),
    ("(s)", ""),
)

amenities_raw = df.amenities.str.lower().tolist()
amenities = [
    reduce(lambda a, kv: a.replace(*kv), amenity_repls, s).split(",")
    for s in amenities_raw
]

all_amenities = list(chain(*amenities))
all_amenities_count = Counter(all_amenities)
all_amenities_count

# having a look to the list we see that one amenity is empty and two are
# "translation missing:..."
keep_amenities = []
for k, v in all_amenities_count.items():
    if k and "missing" not in k:
        keep_amenities.append(k)

final_amenities = [
    [amenity for amenity in house_amenities if amenity in keep_amenities]
    for house_amenities in amenities
]

# some properties have no amenities aparently
final_amenities = [
    ["no amenities"] if not amenity else amenity for amenity in final_amenities
]
final_amenities = [
    ["amenity_" + amenity for amenity in amenities] for amenities in final_amenities
]

# let's build the dummy df
df_list_of_amenities = pd.DataFrame({"groups": final_amenities}, columns=["groups"])
s = df_list_of_amenities["groups"]

mlb = MultiLabelBinarizer()

df_amenities = pd.DataFrame(mlb.fit_transform(s), columns=mlb.classes_, index=df.index)

df.drop("amenities", axis=1, inplace=True)
df = pd.concat([df, df_amenities], axis=1)
df.head()

# Price, security_deposit, cleaning_fee, extra_people

money_columns = ["price", "security_deposit", "cleaning_fee", "extra_people"]
tmp_money_df = df[money_columns].fillna("$0")

money_repls = (("$", ""), (",", ""))
for col in money_columns:
    val_str = tmp_money_df[col].tolist()
    val_num = [
        float(st)
        for st in [
            reduce(lambda a, kv: a.replace(*kv), money_repls, s) for s in val_str
        ]
    ]
    tmp_money_df[col] = val_num

high_price, high_deposit, high_cleaning_fee, high_extra_people = 1000, 2000, 200, 100

high_price_count = (tmp_money_df.price >= high_price).sum()
high_deposit_count = (tmp_money_df.security_deposit >= high_deposit).sum()
high_cleaning_fee_count = (tmp_money_df.cleaning_fee >= high_cleaning_fee).sum()
high_extra_people_count = (tmp_money_df.extra_people >= high_extra_people).sum()

print("properties with very high price: {}".format(high_price_count))
print("properties with very high security deposit: {}".format(high_deposit_count))
print("properties with very high cleaning fee: {}".format(high_cleaning_fee_count))
print("properties with very high extra people cost: {}".format(high_extra_people_count))

# We will now just concat and we will drop high values later one
df.drop(money_columns, axis=1, inplace=True)
df = pd.concat([df, tmp_money_df], axis=1)
df = df[
    (df.price < high_price)
    & (df.price != 0)
    & (df.security_deposit < high_deposit)
    & (df.cleaning_fee < high_cleaning_fee)
    & (df.extra_people < high_extra_people)
]
df.head()
print(df.shape)

# let's make sure there are no nan left
has_nan = df.isnull().any(axis=0)
has_nan = [df.columns[i] for i in np.where(has_nan)[0]]
if not has_nan:
    print("no NaN, all OK")

# Computing a proxi for yield

# Yield is defined as price * occupancy rate. Occupancy rate can be calculated
# by multiplying ((reviews / review rate) * average length of stay), where
# review rate and average length of stay are normally taken as a factor based
# in some model.  For example, in the San Francisco model a review rate of 0.5
# is used to convert reviews to estimated bookings (i.e. we assume that only
# half of the guests will leave a review). An average length of stay of 3
# nights  multiplied by the estimated bookings over a period gives the
# occupancy rate. Therefore, in the expression I have used below, if you want
# to turn my implementation of 'yield' into a "proper" one under the San
# Francisco model assumptions simply multiply my yield by 6 (3 * (1/0.5)) or
# by 72 (3 * 2 * 12) if you prefer per year.

df["yield"] = (df["price"] + df["cleaning_fee"]) * (df["reviews_per_month"])
df.drop(["price", "cleaning_fee", "reviews_per_month"], axis=1, inplace=True)
# we will focus in cases with yield below 600 (we lose ~3% of the data).
# No real reason for this, simply removing some "outliers"
df = df[df["yield"] <= 600]
df.to_csv(DATA_PATH / "listings_processed.csv", index=False)
print("data preprocessed finished. Final shape: {}".format(df.shape))