by Dhanny Indrakusuma
The Tinder Swindler is a viral British true-crime documentary film directed by Felicity Morris and released on Netflix on 2 February 2022. The film tells the story of an Israeli con artist, Simon Leviev, who used the dating application Tinder to find individuals whom he emotionally manipulated into financing his lavish lifestyle. For this project, I analyzed whether tweet activity in a given region affects the viewership of this Netflix Original show.
To create the database for my analysis, I used a total of five data sources, gathered from three different organizations: Netflix, Nasdaq, and Kaggle.
global_all_weeks.csv
- contains the weekly global rank of each title in the weekly top 10 and the hours viewed
countries_all_weeks.csv
- contains the weekly per-country rank of each title in the weekly top 10
netflix_stocks.csv
- retrieved from: https://www.nasdaq.com/market-activity/stocks/nflx/historical
match_group_stocks.csv
- retrieved from: https://www.nasdaq.com/market-activity/stocks/mtch/historical
Tweets.csv
- retrieved from: https://www.kaggle.com/datasets/deepcontractor/100k-tinder-swindler-tweets
The CSV files were then loaded into the cloud server, and the original structure of the data can be seen below:
There were some issues that had to be managed when reading the data sources into the tinder_swindler database; these are outlined in the "Data Wrangling Process" section.
The weeks table acts as a connection between the viewerships and tweets tables through the days table. The weeks table also connects the viewerships and stock_observations tables through the days table. The tweets and viewerships tables are connected through the regions and location_names tables.
ER Diagram
Relational Vocab
Viewership belongs_to Week
Week has_many Viewership
Viewership belongs_to Region
Region has_many Viewership
LocationName belongs_to Region
Region has_many LocationName
Day belongs_to Week
Week has_many Day
StockObservation belongs_to Day
Day has_many StockObservation
Tweet belongs_to Day
Day has_many Tweet
Tweet belongs_to LocationName
LocationName has_many Tweet
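These relationships translate directly into foreign-key joins. As a minimal sketch (assuming the week_id, day_id, and region_id columns defined in the database design below), weekly tweet counts can be lined up with the global viewership like this:
import psycopg2
from psycopg2.extras import RealDictCursor
# sketch only: Tweet -> Day -> Week, and Viewership -> Week / Region
with psycopg2.connect(host='localhost', dbname='tinder_swindler') as conn:
    with conn.cursor(cursor_factory=RealDictCursor) as cursor:
        cursor.execute("""
            SELECT weeks.year_week,
                   COUNT(tweets.id) AS tweet_count,
                   MAX(viewerships.hours_viewed) AS hours_viewed
            FROM tweets
            JOIN days ON tweets.day_id = days.id
            JOIN weeks ON days.week_id = weeks.id
            JOIN viewerships ON viewerships.week_id = weeks.id
            WHERE viewerships.region_id = 1  -- 1 is the 'Global' placeholder
            GROUP BY weeks.year_week
        """)
        print(cursor.fetchall())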
Sample Table
For this project, I managed synonyms of region names in the regions table by creating a has_many relationship to the location_names table through informal_location. The location_name_id column on the tweets table identifies the origin of a tweet, and the region_id column identifies the region/country related to the viewership metric.
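As an illustration of this mapping (a sketch only, assuming the schema described below and using "Canada" as an example region), all informal location strings resolved to a single region could be listed like this:
import psycopg2
from psycopg2.extras import RealDictCursor
# sketch only: one regions row, many location_names rows pointing at it
with psycopg2.connect(host='localhost', dbname='tinder_swindler') as conn:
    with conn.cursor(cursor_factory=RealDictCursor) as cursor:
        cursor.execute("""
            SELECT regions.name, location_names.informal_location
            FROM location_names
            JOIN regions ON location_names.region_id = regions.id
            WHERE regions.name = %(region)s
        """, {"region": "Canada"})
        for row in cursor.fetchall():
            print(row['name'], '<-', row['informal_location'])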
The above figure represents the flowchart for my project workflow.
The data was inserted into the database using Python's psycopg2 library. Depending on the table, the SQL queries were written in a way that avoids duplicate records. Using Python's pandas library, I then created dataframes from the resulting CSV outputs. Next, I analyzed how the popularity of the Netflix Original show "The Tinder Swindler" influences Netflix and Match Group stock performance, and whether tweet activity in a given country affects the show's viewership in that country. The resulting visualizations were created using Python's Matplotlib or seaborn libraries.
Prior to importing data into the database, there were several pre-processing steps that needed to be executed to clean the data:
- I converted the date or week columns into a YYYY-MM-DD date format prior to insertion into the database. The resulting dates are inserted into the days table. At the same time, each date was converted into a "YYYY_WW" string and inserted into the weeks table.
- In Tweets.csv, the user_location column contains location strings with emojis, which first need to be removed using Python's emoji library; the cleaned string is inserted into the location_names table as a value in the informal_location column. Next, it is processed with Python's geopy library to obtain its country name (if one exists), which is inserted as a value in the geocode_raw column and then used to connect to the regions table of the database. When geopy returns "None" or the value in user_location is null, "Global" is inserted as the value instead.
- In the Close/Last column of both netflix_stocks.csv and match_group_stocks.csv, I removed the "$" sign and converted the values into float format. I then inserted the corresponding company's name along with its closing price and traded volume.
- Country names from countries_all_weeks.csv and global_all_weeks.csv are inserted into the regions table. For the global-level metric, I inserted "Global" as a placeholder.
- From countries_all_weeks.csv and global_all_weeks.csv, I am only inserting the weekly_hours_viewed value, the date from the week column, and the weekly_rank, and identifying the region_id based on whether the record came from the global metric or a country-specific metric.
- Dates from netflix_stocks.csv and match_group_stocks.csv are inserted into the days table of the database.
- From Tweets.csv, I inserted the user_location information into the location_names table and the text into the tweets table. I intentionally left the string format of the text column as is, in case there is interest in conducting sentiment analysis on the tweets themselves and how they relate to the show's viewership. However, the current project scope does not go as far as conducting sentiment analysis.
To start, we will create the tinder_swindler database on the cloud server, as well as the respective tables needed to store the data, following the database design outlined in the previous section. The code used for the database creation is displayed below.
# import libraries
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from psycopg2.extras import RealDictCursor
# create database
conn = psycopg2.connect(host='localhost');
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT);
cursor = conn.cursor();
cursor.execute("DROP database tinder_swindler;")
cursor.execute("CREATE DATABASE tinder_swindler;")
cursor.close()
conn.close()
# create tables within database
with psycopg2.connect(host='localhost', dbname='tinder_swindler') as conn:
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
cursor.execute(
'''
DROP TABLE IF EXISTS weeks;
CREATE TABLE weeks (
id SERIAL PRIMARY KEY,
year_week VARCHAR
);
DROP TABLE IF EXISTS regions;
CREATE TABLE regions (
id SERIAL PRIMARY KEY,
name TEXT
);
INSERT INTO regions(name) VALUES ('Global'); -- always 1 for 'Global'
DROP TABLE IF EXISTS viewerships;
CREATE TABLE viewerships (
id SERIAL PRIMARY KEY,
week_id INTEGER,
region_id INTEGER,
rank INTEGER,
hours_viewed INTEGER
);
DROP TABLE IF EXISTS location_names;
CREATE TABLE location_names (
id SERIAL PRIMARY KEY,
region_id INTEGER, -- region_id only there if successfully geocoded.
informal_location VARCHAR, -- we hope to map this to region_id
geocode_raw TEXT
);
DROP TABLE IF EXISTS days;
CREATE TABLE days (
id SERIAL PRIMARY KEY,
week_id INTEGER,
date DATE
);
DROP TABLE IF EXISTS stock_observations;
CREATE TABLE stock_observations (
id SERIAL PRIMARY KEY,
day_id INTEGER,
company TEXT,
close_price NUMERIC,
volume INTEGER
);
DROP TABLE IF EXISTS tweets;
CREATE TABLE tweets (
id SERIAL PRIMARY KEY,
day_id INTEGER,
location_name_id INTEGER,
text VARCHAR
);
'''
)
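As an optional sanity check, assuming the tables live in the default public schema, we can confirm that all seven tables exist before loading any data:
# list the tables that should now exist in the tinder_swindler database
with psycopg2.connect(host='localhost', dbname='tinder_swindler') as conn:
    with conn.cursor() as cursor:
        cursor.execute("""
            SELECT table_name
            FROM information_schema.tables
            WHERE table_schema = 'public'
            ORDER BY table_name
        """)
        print([row[0] for row in cursor.fetchall()])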
Prior to loading data into the tinder_swindler database, parts of the data need to be pre-processed as described in the "Data Wrangling Process" section. The code used for pre-processing and insertion into the database is displayed below.
# install required packages
%pip install emoji
%pip install geopy
# import additional libraries
import csv
import re
import emoji
from datetime import datetime
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
def tweet_date(date):
# parsing date in tweets
only_datetime = date[:date.index("+")]
return datetime.strptime(only_datetime, "%Y-%m-%d %H:%M:%S").date()
def get_week(date):
# get week in format YYYY_WW
if date.isocalendar()[1]/10 >= 1:
return str(date.year) + '_'+ str(date.isocalendar()[1])
else:
return str(date.year) + '_0'+ str(date.isocalendar()[1])
def show_week(date):
# get week in format YYYY_WW for show metrics
date = datetime.strptime(date, "%Y-%m-%d").date()
if date.isocalendar()[1]/10 >= 1:
return str(date.year) + '_'+ str(date.isocalendar()[1])
else:
return str(date.year) + '_0'+ str(date.isocalendar()[1])
def stock_date(date):
# parse date for stocks
return datetime.strptime(date, "%m/%d/%Y").date()
def remove_emoji(string):
# remove emoji from tweets user_location
return emoji.replace_emoji(string, replace='').strip()
def remove_sign(string):
# remove "$" on stock price, converting to float
return float(string.replace("$",""))
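As a quick sanity check, the helpers can be exercised on sample inputs (the tweet timestamp format shown is an assumption based on how tweet_date parses it):
# expected outputs shown in the trailing comments
print(stock_date("02/09/2022"))                  # 2022-02-09
print(get_week(stock_date("02/09/2022")))        # 2022_06 (ISO week, zero-padded)
print(show_week("2022-02-13"))                   # 2022_06
print(tweet_date("2022-02-09 18:34:46+00:00"))   # 2022-02-09
print(remove_emoji("Toronto 🍁"))                 # Toronto
print(remove_sign("$391.82"))                    # 391.82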
We start by inserting data from global_all_weeks.csv. We are only inserting metric values related to the show "The Tinder Swindler" and ignoring the rest.
with psycopg2.connect(host='localhost', dbname='tinder_swindler') as conn:
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
with open('global_all_weeks.csv') as csvfile:
myCSVReader = csv.DictReader(csvfile, delimiter=",", quotechar='"')
for row in myCSVReader:
if row['show_title'] != 'The Tinder Swindler':
continue
week_num = show_week(row['week'])
# need to get the week_id
sql = """
INSERT INTO weeks(year_week)
VALUES (%(year_week)s)
RETURNING id
"""
param_dict = {"year_week": week_num}
cursor.execute(sql, param_dict)
week_id = cursor.fetchone()['id']
# also need to insert viewership data
viewership_sql = """
INSERT into viewerships(week_id, region_id, rank, hours_viewed)
VALUES (%(week_id)s, %(region_id)s, %(weekly_rank)s, %(weekly_hours_viewed)s)
"""
param_dict = {'week_id': week_id,
'region_id': 1, # hard coded region for Global
'weekly_rank': row['weekly_rank'],
'weekly_hours_viewed': row['weekly_hours_viewed']}
cursor.execute(viewership_sql, param_dict)
Next, we insert data from countries_all_weeks.csv, making sure that we don't create duplicates of year_week in the weeks table or of name in the regions table. Again, we only insert metric values related to the show "The Tinder Swindler" and ignore the rest.
with psycopg2.connect(host='localhost', dbname='tinder_swindler') as conn:
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
with open('countries_all_weeks.csv') as csvfile:
myCSVReader = csv.DictReader(csvfile, delimiter=",", quotechar='"')
for row in myCSVReader:
if row['show_title'] != 'The Tinder Swindler':
continue
# check if week_id already exist: fetch id, else: insert and get id
week_num = show_week(row['week'])
w_dict = {"year_week": week_num}
cursor.execute("SELECT id FROM weeks WHERE year_week = %(year_week)s", w_dict)
if (cursor.rowcount == 1):
week_id = cursor.fetchone()['id']
else:
cursor.execute("""
INSERT INTO weeks(year_week)
VALUES (%(year_week)s)
RETURNING id
""", w_dict)
week_id = cursor.fetchone()['id']
# check if region_id already exist: fetch id, else: insert and get id
region = row['country_name']
r_dict = {"region": region}
cursor.execute("SELECT id FROM regions WHERE name = %(region)s", r_dict)
if (cursor.rowcount == 1):
region_id = cursor.fetchone()['id']
else:
cursor.execute("""
INSERT into regions(name)
VALUES (%(region)s)
RETURNING id
""", r_dict)
region_id = cursor.fetchone()['id']
# also need to insert viewership data
viewership_sql = """
INSERT into viewerships(week_id, region_id, rank, hours_viewed)
VALUES (%(week_id)s, %(region_id)s, %(weekly_rank)s, %(weekly_hours_viewed)s)
"""
param_dict = {'week_id': week_id,
'region_id': region_id,
'weekly_rank': row['weekly_rank'],
'weekly_hours_viewed': None}
cursor.execute(viewership_sql, param_dict)
Next, we insert the match_group_stocks.csv data, again handling duplicates in weeks. At this point we simply insert the date values into the days table, since we know each value is unique and this is our first insertion into the days table.
with psycopg2.connect(host='localhost', dbname='tinder_swindler') as conn:
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
with open('match_group_stocks.csv') as csvfile:
myCSVReader = csv.DictReader(csvfile, delimiter=",", quotechar='"')
for row in myCSVReader:
date = stock_date(row['Date'])
week_num = get_week(date)
price = remove_sign(row['Close/Last'])
# check if week_id already exist: fetch id, else: insert and get id
w_dict = {"year_week": week_num}
cursor.execute("SELECT id FROM weeks WHERE year_week = %(year_week)s", w_dict)
if (cursor.rowcount == 1):
week_id = cursor.fetchone()['id']
else:
cursor.execute("""
INSERT INTO weeks(year_week)
VALUES (%(year_week)s)
RETURNING id
""", w_dict)
week_id = cursor.fetchone()['id']
# insert date, get id
sql = """
INSERT INTO days(week_id, date)
VALUES (%(week_id)s, %(date)s)
RETURNING id
"""
d_dict = {"week_id": week_id, "date": date}
cursor.execute(sql, d_dict)
day_id = cursor.fetchone()['id']
# also insert stock_observations data
stock_sql = """
INSERT into stock_observations(day_id, company, close_price, volume)
VALUES (%(day_id)s, %(company)s, %(close_price)s, %(volume)s)
"""
param_dict = {'day_id': day_id,
'company': 'Match Group',
'close_price': price,
'volume': row['Volume']}
cursor.execute(stock_sql, param_dict)
Next, we insert the netflix_stocks.csv data. Since the dates here are exactly the same as in match_group_stocks.csv, we simply fetch the day_id as we insert values into the stock_observations table.
with psycopg2.connect(host='localhost', dbname='tinder_swindler') as conn:
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
with open('netflix_stocks.csv') as csvfile:
myCSVReader = csv.DictReader(csvfile, delimiter=",", quotechar='"')
for row in myCSVReader:
date = stock_date(row['Date'])
price = remove_sign(row['Close/Last'])
# get date id since dates are same as match group stocks
d_dict = {"week_id": week_id, "date": date}
cursor.execute("SELECT id FROM days WHERE date = %(date)s", d_dict)
if (cursor.rowcount == 1):
day_id = cursor.fetchone()['id']
# also insert stock_observations data
stock_sql = """
INSERT into stock_observations(day_id, company, close_price, volume)
VALUES (%(day_id)s, %(company)s, %(close_price)s, %(volume)s)
"""
param_dict = {'day_id': day_id,
'company': 'Netflix',
'close_price': price,
'volume': row['Volume']}
cursor.execute(stock_sql, param_dict)
Next, we insert data from Tweets.csv. In this step we remove emojis present in user_location and insert the result as informal_location values in the location_names table. We also fetch the day_id as we insert the text into the tweets table.
with psycopg2.connect(host='localhost', dbname='tinder_swindler') as conn:
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
with open('Tweets.csv') as csvfile:
myCSVReader = csv.DictReader(csvfile, delimiter=",", quotechar='"')
for row in myCSVReader:
# insert informal_location
location = remove_emoji(row['user_location'])
l_dict = {"location": location}
cursor.execute("""
SELECT id FROM location_names
WHERE informal_location = %(location)s
""", l_dict)
if (cursor.rowcount == 1):
location_id = cursor.fetchone()['id']
else:
cursor.execute("""
INSERT INTO location_names(informal_location)
VALUES (%(location)s)
RETURNING id
""", l_dict)
location_id = cursor.fetchone()['id']
# need to compare date, get day_id
date = tweet_date(row['date'])
d_dict = {"date": date}
cursor.execute("SELECT id FROM days WHERE date = %(date)s", d_dict)
if (cursor.rowcount == 1):
day_id = cursor.fetchone()['id']
# insert row['text'] and get location_names_id
tweets_sql = """
INSERT into tweets(day_id, location_name_id, text)
VALUES (%(day_id)s, %(location_id)s, %(text)s)
"""
param_dict = {'day_id': day_id,
'location_id': location_id,
'text': row['text']}
cursor.execute(tweets_sql, param_dict)
Finally, we geocode informal_location into geocode_raw and fetch the region_id when geocoding succeeds. If geopy returns "None", we insert region_id = 1 to mark the row as a "Global" region entry. The code below needs to be run multiple times, as we are parsing 100,000 rows of user locations from the tweet data.
# Fill null values or empty string with 'None' to speed up the process of geocoding
with psycopg2.connect(host='localhost', dbname='tinder_swindler') as conn:
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
update_location = """
UPDATE location_names
SET region_id = %(region_id)s
WHERE id = %(location_name_id)s
"""
cursor.execute("""
SELECT id, informal_location
FROM location_names
WHERE (informal_location IS NULL
AND geocode_raw IS NULL)
OR (informal_location = '')
""")
results = cursor.fetchall()
for row in results:
# insert 'None' when informal_location and geocode_raw is NULL
# store in the geocode_raw
none_geocode_sql = """
UPDATE location_names
SET geocode_raw = %(geocode_raw)s
WHERE id = %(location_name_id)s
"""
# no geocoding needed here: these rows are marked 'None' directly
param_dict = {'geocode_raw': 'None',
'location_name_id': row['id']}
print(param_dict['geocode_raw'])
cursor.execute(none_geocode_sql, param_dict)
# Now parse informal_location to geocode_raw
with psycopg2.connect(host='localhost', dbname='tinder_swindler') as conn:
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
update_location = """
UPDATE location_names
SET region_id = %(region_id)s
WHERE id = %(location_name_id)s
"""
cursor.execute("""
SELECT id, informal_location
FROM location_names
WHERE geocode_raw IS NULL
LIMIT 1000
""") # limit parsing, otherwise, timeout error
# setup the geocoder.
geolocator = Nominatim(user_agent = "geoapiExercises")
# creates a function we will call later.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
results = cursor.fetchall()
for row in results:
# geocode the location_string
# store in the geocode_raw
cache_geocode_sql = """
UPDATE location_names
SET geocode_raw = %(geocode_raw)s
WHERE id = %(location_name_id)s
"""
# print(row)
location = geocode(row['informal_location'], language='en')
param_dict = {'geocode_raw': str(location), # location.raw will give us a JSON file with latitude and longtitude
'location_name_id': row['id']}
#print(param_dict['geocode_raw'])
cursor.execute(cache_geocode_sql, param_dict)
""" NOTE: original code causes timeout issue as it kept trying to parse NULL rows that had been parsed before
if location is not None:
param_dict = {'geocode_raw': str(location), # location.raw will give us a JSON file with latitude and longtitude
'location_name_id': row['id']}
print(param_dict)
cursor.execute(cache_geocode_sql, param_dict)
"""
# Next, we update region_id in location_names table
with psycopg2.connect(host='localhost', dbname='tinder_swindler') as conn:
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
update_location = """
UPDATE location_names
SET region_id = %(region_id)s
WHERE id = %(location_name_id)s
"""
cursor.execute("""
SELECT id, geocode_raw
FROM location_names
WHERE geocode_raw IS NOT NULL
""")
results = cursor.fetchall()
for row in results:
location = row['geocode_raw'].split(",")[-1].strip()
if location == 'None':
region_id = 1
else:
# find or create a region in the regions table
sql_select_region = """
SELECT id
FROM regions
WHERE name = %(location)s
"""
cursor.execute(sql_select_region, {"location": location})
# check if there is region_id result
if (cursor.rowcount == 1):
region_id = cursor.fetchone()['id']
else:
region_id = 1
cursor.execute(update_location, {"region_id": region_id, "location_name_id": row['id']})
%load_ext sql
%env DATABASE_URL=postgresql://localhost/tinder_swindler
env: DATABASE_URL=postgresql://localhost/tinder_swindler
Let's check the count of unique locations and parsed geocodes
%%sql
SELECT COUNT(DISTINCT informal_location) AS "location count",
COUNT(DISTINCT geocode_raw) AS "geocode count"
FROM location_names
WHERE informal_location IS NOT NULL
OR geocode_raw IS NOT NULL
1 rows affected.
location count | geocode count |
---|---|
17329 | 6744 |
Then check if all rows were geocoded
%%sql
SELECT COUNT(geocode_raw)
FROM location_names
WHERE geocode_raw IS NULL
* postgresql://localhost/tinder_swindler 1 rows affected.
count |
---|
0 |
Looking at a sample of the tweets data in the database
%%sql
select text, date, region_id, location_name_id, informal_location, geocode_raw
from tweets
JOIN days ON tweets.day_id = days.id
JOIN location_names ON tweets.location_name_id = location_names.id
where geocode_raw IS NOT NULL
LIMIT 10
* postgresql://localhost/tinder_swindler 10 rows affected.
text | date | region_id | location_name_id | informal_location | geocode_raw |
---|---|---|---|---|---|
@T1gmee Tinder Swindler, off Netflix | 2022-02-09 | 1 | 1 | Travelling the World on Web3 | None |
#tinderswindler , basically homeboy got away with it | 2022-02-09 | 1 | 2 | None | |
Not the tinder swindler becoming a TIk Tok star 🙄 #FreeHushpuppi | 2022-02-09 | 1 | 2 | None | |
Why isn’t Shimon Hayut in jail ????? #thetinderswindler | 2022-02-09 | 56 | 3 | Noord-Brabant | North Brabant, Netherlands |
the tinder swindler a sick nigga 😂😂 | 2022-02-09 | 81 | 4 | PG/MD | Polígono Industrial de Fuencarral, Fuencarral, Madrid, Área metropolitana de Madrid y Corredor del Henares, Community of Madrid, 28001, Spain |
No those women on the Tinder swindler pissed me the fuck off because how are you a “heir to a billionaire Diamond c… https://t.co/ULnjv2TBJ7 | 2022-02-09 | 12 | 5 | Toronto, Ontario | Toronto, Golden Horseshoe, Ontario, Canada |
This manipulation is how scammers pull you in to fall in love with em then in a few days or month they asking for c… https://t.co/weKMMAnmew | 2022-02-09 | 1 | 2 | None | |
@robertstweets1 @DistantFixed I waited for him to say three out the list then shafted him back! It was fun actually… https://t.co/A0SQM1VkJw | 2022-02-09 | 91 | 6 | East, England | East (Andrews) Park, Cultural Quarter, The Polygon, Southampton, South East England, England, SO14 0DA, United Kingdom |
Hmmm...he's like the #tinderswindler https://t.co/O9T3mCAJ5T | 2022-02-09 | 79 | 7 | Johannesburg, South Africa | Johannesburg, City of Johannesburg Metropolitan Municipality, Gauteng, 2001, South Africa |
Wow just watching the Twitter Swindler and it reminds me so much of Ron & Kate Bensimon Leviev duo - Australian dia… https://t.co/jRS5nCMoEW | 2022-02-09 | 1 | 2 | None |
To prepare for the analysis, I exported data on viewership and tweet counts for the "Global" region (region_id = 1). The SQL query used to pull the desired data and export it into a CSV file can be seen below:
import psycopg2
from psycopg2.extras import RealDictCursor
import csv
with psycopg2.connect(host='localhost', dbname='tinder_swindler') as conn:
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
cursor.execute("""
SELECT regions.id AS region_id,
weeks.year_week,
viewerships.hours_viewed,
COUNT(TEXT) OVER (PARTITION BY weeks.year_week) AS tweet_count
FROM tweets
JOIN days
ON tweets.day_id = days.id
JOIN weeks
ON days.week_id = weeks.id
JOIN location_names
ON tweets.location_name_id = location_names.id
JOIN regions
ON location_names.region_id = regions.id
JOIN viewerships
ON viewerships.week_id = weeks.id
AND viewerships.region_id = regions.id
WHERE regions.id = 1
GROUP BY regions.id,
weeks.year_week,
viewerships.hours_viewed,
tweets.text
""")
with open('global-tweetcount-viewership.csv', 'w') as csvfile:
# Declare csv fields in the order we want them
column_names = ["region_id", "year_week", "hours_viewed", "tweet_count"]
myCsvWriter = csv.DictWriter(csvfile,
fieldnames=column_names)
myCsvWriter.writeheader()
for row in cursor:
myCsvWriter.writerow(row)
print("Done writing csv")
Done writing csv
To answer the project's question of whether tweet activity in a given region affects the show's viewership, I used the SQL query above to pull out the desired data. Then, the data was manipulated using the Pandas library.
Tool: Having learned Pandas in another course as well as through Datacamp courses, I find it quite user-friendly, as I can inspect the result of each data manipulation step to check that it produces the desired output.
%pip install pandas
import pandas as pd
tweet_view_df = pd.read_csv('global-tweetcount-viewership.csv')
tweet_view_df[['year', 'week']] = tweet_view_df['year_week'].str.split("_", expand=True)
tweet_view_df.head()
region_id | year_week | hours_viewed | tweet_count | year | week | |
---|---|---|---|---|---|---|
0 | 1 | 2022_05 | 45800000 | 7649 | 2022 | 05 |
1 | 1 | 2022_05 | 45800000 | 7649 | 2022 | 05 |
2 | 1 | 2022_05 | 45800000 | 7649 | 2022 | 05 |
3 | 1 | 2022_05 | 45800000 | 7649 | 2022 | 05 |
4 | 1 | 2022_05 | 45800000 | 7649 | 2022 | 05 |
import datetime
from dateutil.relativedelta import relativedelta
week = tweet_view_df['week'].astype(int)  # e.g. "05" -> 5; str.replace("0", "") would mangle two-digit weeks like 10 or 20
date = [datetime.date(2022, 1, 1) + relativedelta(weeks=+int(e)) for e in week]
tweet_view_df['date'] = pd.DataFrame (date, columns = ['date'])
tweet_view_df.drop(['region_id', 'year_week', 'year', 'week'], axis=1).head()
hours_viewed | tweet_count | date | |
---|---|---|---|
0 | 45800000 | 7649 | 2022-02-05 |
1 | 45800000 | 7649 | 2022-02-05 |
2 | 45800000 | 7649 | 2022-02-05 |
3 | 45800000 | 7649 | 2022-02-05 |
4 | 45800000 | 7649 | 2022-02-05 |
grouped_df = tweet_view_df.groupby('date')[['tweet_count', 'hours_viewed']].mean()
print(grouped_df)
            tweet_count  hours_viewed
date
2022-02-05       7649.0    45800000.0
2022-02-12      24981.0    64700000.0
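For illustration, a minimal sketch of how grouped_df could be plotted on twin axes with Matplotlib (axis labels and styling are arbitrary choices) looks like this:
import matplotlib.pyplot as plt
# sketch only: tweet counts and hours viewed for the two available weeks
fig, ax1 = plt.subplots(figsize=(8, 4))
ax2 = ax1.twinx()
ax1.plot(grouped_df.index, grouped_df['tweet_count'], marker='o', color='tab:blue')
ax2.plot(grouped_df.index, grouped_df['hours_viewed'], marker='s', color='tab:orange')
ax1.set_xlabel('week starting')
ax1.set_ylabel('tweet count', color='tab:blue')
ax2.set_ylabel('hours viewed', color='tab:orange')
plt.title('Global tweet activity vs. "The Tinder Swindler" hours viewed')
plt.tight_layout()
plt.show()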
As we see above, we only have two weeks' worth of tweet data for this analysis. Although the number of hours viewed increased in the second week, we cannot be sure that this is directly correlated with viewer interest as expressed by tweet counts.
The biggest challenge I faced initially was the database design, as I struggled to understand how best to map the relationships between the viewership data, which was collected weekly, and the company stock data and tweet data, which were collected daily. Once I resolved the issue by creating a separate weeks table to hold the year_week record and connecting it to the days table using the foreign key week_id, the whole database design made more sense. It helped to think about what my query to export the resulting data for analysis would look like, and to reverse-engineer the database design from there.
Another challenge I faced was parsing the user_location column in Tweets.csv into a region name (identified by either a country's name or "Global") to create the relationship between the tweets and viewerships tables. Although I was able to easily remove emojis from the user_location records, I kept running into issues such as AttributeError: 'NoneType' object has no attribute 'raw' and Read timed out when using the geocoder. At the same time, the results I was getting from geopy's Photon geocoding service were not satisfactory, as it was not able to output the country name in English when the input used non-Latin characters.
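Both failure modes can be guarded against explicitly. A minimal sketch that checks for a None result before touching .raw and catches geopy's timeout exceptions, assuming the same Nominatim/RateLimiter setup, looks like this:
from geopy.exc import GeocoderTimedOut, GeocoderUnavailable
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="geoapiExercises")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
def safe_geocode(loc_string):
    # guard against a timed-out request and a None result (which has no .raw attribute)
    try:
        location = geocode(loc_string, language='en')
    except (GeocoderTimedOut, GeocoderUnavailable):
        return None
    return str(location) if location is not None else None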
After further trial and error, I found that the Nominatim geocoding service from the geopy library gave me the best results, and I used it to parse the locations for my database. I also found a way to automate parsing a country's name from a user-generated location string, since the geocoded address comes back as a comma-separated list; it can also be applied to a pandas dataframe using the .apply() method, as shown after the example below.
# geocoding service need to be declared outside function to limit time out issue
geolocator = Nominatim(user_agent = "geoapiExercises")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
def get_location(loc):
location = str(geocode(loc, language='en', exactly_one=True)) # returns a string of address separated by ","
country = location.split(",")[-1] # split string by "," and obtain last element, which is the country's name
return country
Example of use:
location string: 'الله '
location
output: 'الله, District 8, Isfahan, بخش مرکزی شهرستان اصفهان, Isfahan County, Isfahan Province, +98313, Iran'
country
output: 'Iran'
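Since get_location can also be applied to a pandas dataframe, here is a minimal sketch of that usage (the user_location column name and sample values are assumptions for illustration):
import pandas as pd
# hypothetical dataframe of raw tweet locations
locations_df = pd.DataFrame({"user_location": ["Toronto, Ontario", "Johannesburg, South Africa"]})
# apply the geocoding helper row by row; the RateLimiter keeps requests to roughly
# one per second, so this is slow on 100,000 rows
locations_df["country"] = locations_df["user_location"].apply(get_location)
print(locations_df)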
However, if we are interested in the raw location generated by Nominatim, we need to use location.raw to get a JSON-like dictionary of the geopy location that also includes the latitude and longitude. There is definitely an opportunity in the future to create a geographic map of tweet origins based on the latitude and longitude from the geocode parsing.
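A small sketch of pulling those coordinates (assuming the same geolocator and an example location string):
# geopy Location objects expose coordinates directly as well as via .raw
loc = geolocator.geocode("Toronto, Ontario", language='en')
if loc is not None:
    print(loc.latitude, loc.longitude)     # convenience attributes
    print(loc.raw['lat'], loc.raw['lon'])  # same values from the raw Nominatim payload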
Being able to convert 100,000 rows of user locations (mostly very messy data) found in the tweets and connect them to the regions table was definitely the most interesting and time-consuming challenge of this project.