Skip to content

Commit e72d751

Browse files
committed
formatting
1 parent a2600dd commit e72d751

File tree

2 files changed

+79
-23
lines changed

2 files changed

+79
-23
lines changed

guides/Upload-SQL-MariaDB/ReadMe.md

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# Upload a file to SQL database
2+
3+
**This code demonstrates how to upload files into SQL.** The files should begin as a `.csv` file, typically from a dataframe.
4+
15
Useful resources with more detail:
26

37
https://grid.rcs.hbs.org/importing
@@ -14,27 +18,26 @@ https://grid.rcs.hbs.org/import-and-export-text-files
1418
**B. Create Database** - enter the GRID (or SecureCRT) and use the following commands on the command line interface.
1519
1. `mysql`
1620
2. `use agoldenberg_twitter_data;`
17-
3. `create table TABLENAME (Column_1 char(20), Column_2 char(20), Column_3 char(20));` - this command should have also appeared in the file formatter above
18-
4. `quit;` - exit the database interface to import the data
21+
3. `create table table_import (Column_1 char(20), Column_2 char(20), Column_3 char(20));` - this command should have also appeared in the file formatter above
1922

20-
Create Import Directory
21-
1. mkdir /export/mdb_external/import/username
22-
2. chmod 700 /export/mdb_external/import/username
23+
**Create Import Directory**
24+
1. `mkdir /export/mdb_external/import/username`
25+
2. `chmod 700 /export/mdb_external/import/username`
2326

24-
Move File
25-
1. pwd #Get the full directory of your desktop where your file is
26-
returns for example: /export/home/rcsguest/rcs_username/Desktop
27-
2. mv /export/home/rcsguest/rcs_username/Desktop/SampleData.txt /export/mdb_external/import/username
27+
**Move File**
28+
1. `pwd` #Get the full directory of your desktop where your file is
29+
returns for example: `/export/home/rcsguest/rcs_username/Desktop`
30+
2. `mv /export/home/rcsguest/rcs_username/Desktop/SampleData.txt /export/mdb_external/import/username`
2831

29-
Import File
30-
1. mysql
31-
2. use agoldenberg_DATABASENAME;
32-
3. load data local infile '/export/mdb_external/import/username/SampleData.txt' into table table_import fields terminated by '|' lines terminated by '\n' ignore 1 lines;
32+
**Import File**
33+
1. `mysql`
34+
2. `use agoldenberg_DATABASENAME;`
35+
3. `load data local infile '/export/mdb_external/import/username/SampleData.txt' into table table_import fields terminated by '|' lines terminated by '\n' ignore 1 lines;`
3336

34-
Check Data
35-
1.Describe table_import;
36-
2. SELECT * FROM table_import;
37+
**Check Data**
38+
1. `Describe TABLENAME;`
39+
2. `SELECT * FROM TABLENAME;`
3740

38-
Remove temp-import files & folder
41+
**Remove temp-import files & folder**
3942
1. rm -rf /export/mdb_external/import/username
4043

guides/Upload-SQL-MariaDB/csv-to-txt-for-SQL.py

Lines changed: 59 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44

55
# Defining files and locations
66

7-
in_filename = "renamed_columns_tweets_all" + ".csv"
7+
in_filename = "all_api_sentiment_tweets" + ".csv"
88
file_location = "data"
9-
out_filename = "renamed_columns_tweets_all" + ".txt"
9+
out_filename = "all_api_sentiment_tweets" + ".txt"
1010
db_table_name = "agoldenberg_twitter_hook"
1111

1212
# Functions
@@ -17,13 +17,19 @@ def csv_reader(in_filename, file_location):
1717

1818
pd.set_option('display.float_format', lambda x: '%.5f' % x)
1919
pd.set_option('display.max_colwidth', None)
20-
dtypes = {'user_id': 'str', 'tweet_id': 'str', 'in_reply_to_status': 'str', 'retweeted_status_id': 'str',
21-
'quoted_status_id': 'str'}
20+
dtypes = {'user_id': 'str', 'tweet_id': 'str', 'in_reply_to_status': 'str', 'retweeted_status_id': 'str','quoted_status_id': 'str'}
2221

2322
df = pd.read_csv(relative_file_path, # reading in the original dataset
2423
error_bad_lines=False,
2524
index_col=False,
2625
dtype=dtypes)
26+
27+
# dtypes = {'tweet_id': 'str'}
28+
#
29+
# df = pd.read_csv(relative_file_path, # reading in the original dataset
30+
# error_bad_lines=False,
31+
# index_col=False,
32+
# dtype=dtypes)
2733
#df.update(df.select_dtypes(include=np.number).applymap('{:,g}'.format)) # remove trailing 0s
2834
df = df.convert_dtypes()
2935
return df
@@ -35,6 +41,7 @@ def indexer(df):
3541
and not col.startswith('inserted_at')
3642
and not col.startswith('id')] # remove unwanted columns
3743
df = df[filter_col]
44+
print("indexer "+ df.columns)
3845
if 'created_at' in df.columns:
3946
df.loc[:,'created_at'] = pd.to_datetime(df['created_at'])
4047

@@ -303,36 +310,82 @@ def command_builder(df,db_table_name):
303310
col_name = col
304311
SQL_datatype = "text,"
305312
col_command = col_name + " " + SQL_datatype + " "
313+
314+
## for sentiment analysis columns
315+
elif col == "tweet_length":
316+
col_name = col
317+
SQL_datatype = "int,"
318+
col_command = col_name + " " + SQL_datatype + " "
319+
320+
elif col == "vader_tweet_pos":
321+
col_name = col
322+
SQL_datatype = "DOUBLE,"
323+
col_command = col_name + " " + SQL_datatype + " "
324+
325+
elif col == "vader_tweet_neg":
326+
col_name = col
327+
SQL_datatype = "DOUBLE,"
328+
col_command = col_name + " " + SQL_datatype + " "
329+
330+
elif col == "vader_tweet_compound":
331+
col_name = col
332+
SQL_datatype = "DOUBLE,"
333+
col_command = col_name + " " + SQL_datatype + " "
334+
335+
elif col == "vader_tweet_neu":
336+
col_name = col
337+
SQL_datatype = "DOUBLE,"
338+
col_command = col_name + " " + SQL_datatype + " "
339+
340+
elif col == "vader_tweet_category":
341+
col_name = col
342+
SQL_datatype = "text,"
343+
col_command = col_name + " " + SQL_datatype + " "
344+
345+
elif col == "vader_tweet_sent_dict":
346+
col_name = col
347+
SQL_datatype = "text,"
348+
col_command = col_name + " " + SQL_datatype + " "
349+
350+
elif col == "vader_tweet_sent_dict":
351+
col_name = col
352+
SQL_datatype = "text,"
353+
col_command = col_name + " " + SQL_datatype + " "
354+
306355
else:
307356
print("The column name '" + col + "' is not part of the regular list of variables and needs to be added manually")
308357
string_col_commands = string_col_commands + col_command
309358
sql_command = build_command + string_col_commands + ");"
310359
return(sql_command)
311-
print(sql_command)
360+
312361

313362
def pipe_remover(df):
314363
for col in df:
315364
col_name = col
316365
pandas_datatype = str(df[col].infer_objects().dtypes)
317366
print(col)
318367
print(pandas_datatype)
319-
if pandas_datatype == "string":
368+
if pandas_datatype == "string" and col !="tweet_id":
320369
print("TRUE")
321370
df[col_name] = df[col_name].str.replace("|", "")
322371
df[col_name] = df[col_name].str.replace('"', "")
323372
df[col_name] = df[col_name].str.replace("\n", "")
324373
df[col_name] = df[col_name].str.replace("\r", "")
374+
print(df.columns+ " pipe_remover")
325375
return df
326376

327377

328378
def csv_to_SQL_formatter(in_filename, file_location, out_filename, db_table_name):
329379
df = csv_reader(in_filename, file_location) # read file
330380
df = indexer(df)
331381
df = pipe_remover(df)
382+
print(df.columns)
332383
sql_command = command_builder(df,db_table_name)
333384
current_path = os.getcwd()
334385
relative_file_path = os.path.join(current_path, file_location, out_filename)
335386
df.to_csv(relative_file_path, header=True, index=False, sep='|', mode='a')
387+
print(df.columns)
388+
print(sql_command)
336389
return(sql_command)
337390

338391
# Calling function

0 commit comments

Comments
 (0)