Skip to content

Commit e72d751

Browse files
committed
formatting
1 parent a2600dd commit e72d751

File tree

2 files changed

+79
-23
lines changed

2 files changed

+79
-23
lines changed

guides/Upload-SQL-MariaDB/ReadMe.md

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# Upload a file to SQL database
2+
3+
**This code demonstrates how to upload files into SQL.** The files should begin as a `.csv` file, typically from a dataframe.
4+
15
Useful resources with more detail:
26

37
https://grid.rcs.hbs.org/importing
@@ -14,27 +18,26 @@ https://grid.rcs.hbs.org/import-and-export-text-files
1418
**B. Create Database** - enter the GRID (or SecureCRT) and use the following commands on the command line interface.
1519
1. `mysql`
1620
2. `use agoldenberg_twitter_data;`
17-
3. `create table TABLENAME (Column_1 char(20), Column_2 char(20), Column_3 char(20));` - this command should have also appeared in the file formatter above
18-
4. `quit;` - exit the database interface to import the data
21+
3. `create table table_import (Column_1 char(20), Column_2 char(20), Column_3 char(20));` - this command should have also appeared in the file formatter above
1922

20-
Create Import Directory
21-
1. mkdir /export/mdb_external/import/username
22-
2. chmod 700 /export/mdb_external/import/username
23+
**Create Import Directory**
24+
1. `mkdir /export/mdb_external/import/username`
25+
2. `chmod 700 /export/mdb_external/import/username`
2326

24-
Move File
25-
1. pwd #Get the full directory of your desktop where your file is
26-
returns for example: /export/home/rcsguest/rcs_username/Desktop
27-
2. mv /export/home/rcsguest/rcs_username/Desktop/SampleData.txt /export/mdb_external/import/username
27+
**Move File**
28+
1. `pwd` #Get the full directory of your desktop where your file is
29+
returns for example: `/export/home/rcsguest/rcs_username/Desktop`
30+
2. `mv /export/home/rcsguest/rcs_username/Desktop/SampleData.txt /export/mdb_external/import/username`
2831

29-
Import File
30-
1. mysql
31-
2. use agoldenberg_DATABASENAME;
32-
3. load data local infile '/export/mdb_external/import/username/SampleData.txt' into table table_import fields terminated by '|' lines terminated by '\n' ignore 1 lines;
32+
**Import File**
33+
1. `mysql`
34+
2. `use agoldenberg_DATABASENAME;`
35+
3. `load data local infile '/export/mdb_external/import/username/SampleData.txt' into table table_import fields terminated by '|' lines terminated by '\n' ignore 1 lines;`
3336

34-
Check Data
35-
1.Describe table_import;
36-
2. SELECT * FROM table_import;
37+
**Check Data**
38+
1. `Describe TABLENAME;`
39+
2. `SELECT * FROM TABLENAME;`
3740

38-
Remove temp-import files & folder
41+
**Remove temp-import files & folder**
3942
1. rm -rf /export/mdb_external/import/username
4043

guides/Upload-SQL-MariaDB/csv-to-txt-for-SQL.py

Lines changed: 59 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44

55
# Defining files and locations
66

7-
in_filename = "renamed_columns_tweets_all" + ".csv"
7+
in_filename = "all_api_sentiment_tweets" + ".csv"
88
file_location = "data"
9-
out_filename = "renamed_columns_tweets_all" + ".txt"
9+
out_filename = "all_api_sentiment_tweets" + ".txt"
1010
db_table_name = "agoldenberg_twitter_hook"
1111

1212
# Functions
@@ -17,13 +17,19 @@ def csv_reader(in_filename, file_location):
1717

1818
pd.set_option('display.float_format', lambda x: '%.5f' % x)
1919
pd.set_option('display.max_colwidth', None)
20-
dtypes = {'user_id': 'str', 'tweet_id': 'str', 'in_reply_to_status': 'str', 'retweeted_status_id': 'str',
21-
'quoted_status_id': 'str'}
20+
dtypes = {'user_id': 'str', 'tweet_id': 'str', 'in_reply_to_status': 'str', 'retweeted_status_id': 'str','quoted_status_id': 'str'}
2221

2322
df = pd.read_csv(relative_file_path, # reading in the original dataset
2423
error_bad_lines=False,
2524
index_col=False,
2625
dtype=dtypes)
26+
27+
# dtypes = {'tweet_id': 'str'}
28+
#
29+
# df = pd.read_csv(relative_file_path, # reading in the original dataset
30+
# error_bad_lines=False,
31+
# index_col=False,
32+
# dtype=dtypes)
2733
#df.update(df.select_dtypes(include=np.number).applymap('{:,g}'.format)) # remove trailing 0s
2834
df = df.convert_dtypes()
2935
return df
@@ -35,6 +41,7 @@ def indexer(df):
3541
and not col.startswith('inserted_at')
3642
and not col.startswith('id')] # remove unwanted columns
3743
df = df[filter_col]
44+
print("indexer "+ df.columns)
3845
if 'created_at' in df.columns:
3946
df.loc[:,'created_at'] = pd.to_datetime(df['created_at'])
4047

@@ -303,36 +310,82 @@ def command_builder(df,db_table_name):
303310
col_name = col
304311
SQL_datatype = "text,"
305312
col_command = col_name + " " + SQL_datatype + " "
313+
314+
## for sentiment analysis columns
315+
elif col == "tweet_length":
316+
col_name = col
317+
SQL_datatype = "int,"
318+
col_command = col_name + " " + SQL_datatype + " "
319+
320+
elif col == "vader_tweet_pos":
321+
col_name = col
322+
SQL_datatype = "DOUBLE,"
323+
col_command = col_name + " " + SQL_datatype + " "
324+
325+
elif col == "vader_tweet_neg":
326+
col_name = col
327+
SQL_datatype = "DOUBLE,"
328+
col_command = col_name + " " + SQL_datatype + " "
329+
330+
elif col == "vader_tweet_compound":
331+
col_name = col
332+
SQL_datatype = "DOUBLE,"
333+
col_command = col_name + " " + SQL_datatype + " "
334+
335+
elif col == "vader_tweet_neu":
336+
col_name = col
337+
SQL_datatype = "DOUBLE,"
338+
col_command = col_name + " " + SQL_datatype + " "
339+
340+
elif col == "vader_tweet_category":
341+
col_name = col
342+
SQL_datatype = "text,"
343+
col_command = col_name + " " + SQL_datatype + " "
344+
345+
elif col == "vader_tweet_sent_dict":
346+
col_name = col
347+
SQL_datatype = "text,"
348+
col_command = col_name + " " + SQL_datatype + " "
349+
350+
elif col == "vader_tweet_sent_dict":
351+
col_name = col
352+
SQL_datatype = "text,"
353+
col_command = col_name + " " + SQL_datatype + " "
354+
306355
else:
307356
print("The column name '" + col + "' is not part of the regular list of variables and needs to be added manually")
308357
string_col_commands = string_col_commands + col_command
309358
sql_command = build_command + string_col_commands + ");"
310359
return(sql_command)
311-
print(sql_command)
360+
312361

313362
def pipe_remover(df):
314363
for col in df:
315364
col_name = col
316365
pandas_datatype = str(df[col].infer_objects().dtypes)
317366
print(col)
318367
print(pandas_datatype)
319-
if pandas_datatype == "string":
368+
if pandas_datatype == "string" and col !="tweet_id":
320369
print("TRUE")
321370
df[col_name] = df[col_name].str.replace("|", "")
322371
df[col_name] = df[col_name].str.replace('"', "")
323372
df[col_name] = df[col_name].str.replace("\n", "")
324373
df[col_name] = df[col_name].str.replace("\r", "")
374+
print(df.columns+ " pipe_remover")
325375
return df
326376

327377

328378
def csv_to_SQL_formatter(in_filename, file_location, out_filename, db_table_name):
329379
df = csv_reader(in_filename, file_location) # read file
330380
df = indexer(df)
331381
df = pipe_remover(df)
382+
print(df.columns)
332383
sql_command = command_builder(df,db_table_name)
333384
current_path = os.getcwd()
334385
relative_file_path = os.path.join(current_path, file_location, out_filename)
335386
df.to_csv(relative_file_path, header=True, index=False, sep='|', mode='a')
387+
print(df.columns)
388+
print(sql_command)
336389
return(sql_command)
337390

338391
# Calling function

0 commit comments

Comments
 (0)