import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
Bar and Scatter
Bar
# Bar chart of one salary series (dev_y) against age (ages_x).
ages_x = [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]
dev_y = [38496, 42000, 46752, 49320, 53200, 56000, 62316, 64928, 67317, 68748, 73752]

plt.style.use("ggplot")
plt.bar(ages_x, dev_y, color="k", linestyle="--", label="Normal developers")

plt.title("Median Salaries by Age")
plt.xlabel("Age")
plt.ylabel("Median Salaries (USD)")
plt.legend()  # uses the label= passed to plt.bar above
plt.tight_layout()
# Mix plot types on one axes: two series as lines, one series as bars.
ages_x = [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]
dev_y = [38496, 42000, 46752, 49320, 53200, 56000, 62316, 64928, 67317, 68748, 73752]
py_dev_y = [45372, 48476, 53850, 57287, 63016, 65998, 70003, 70000, 71496, 75370, 83640]
js_dev_y = [37810, 43515, 46823, 49293, 53437, 56373, 62375, 66674, 68745, 68746, 74583]

plt.style.use("ggplot")
plt.plot(ages_x, py_dev_y, color="#5a7d9a", linestyle="-", linewidth=4, label="Python_Developers")
plt.plot(ages_x, js_dev_y, color="#adad3b", linestyle="-", linewidth=4, label="Java_script_Developers")
plt.bar(ages_x, dev_y, color="k", linestyle="--", label="Normal developers")

plt.title("Median Salaries by Age")
plt.xlabel("Age")
plt.ylabel("Median Salaries (USD)")
plt.legend()
plt.tight_layout()
Side-by-side
# Draw all three series as bars at the same x positions.
# Because every call uses ages_x unchanged, the bars are drawn on top of
# each other rather than next to each other.
ages_x = [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]
dev_y = [38496, 42000, 46752, 49320, 53200, 56000, 62316, 64928, 67317, 68748, 73752]
py_dev_y = [45372, 48476, 53850, 57287, 63016, 65998, 70003, 70000, 71496, 75370, 83640]
js_dev_y = [37810, 43515, 46823, 49293, 53437, 56373, 62375, 66674, 68745, 68746, 74583]

plt.style.use("ggplot")
plt.bar(ages_x, py_dev_y, color="#5a7d9a", linestyle="-", linewidth=4, label="Python_Developers")
plt.bar(ages_x, js_dev_y, color="#adad3b", linestyle="-", linewidth=4, label="Java_script_Developers")
plt.bar(ages_x, dev_y, color="k", linestyle="--", label="Normal developers")

plt.title("Median Salaries by Age")
plt.xlabel("Age")
plt.ylabel("Median Salaries (USD)")
plt.legend()
plt.tight_layout()
# Side-by-side bars: plot against positional indices and shift each series
# by one bar width so the three bars of an age group sit next to each other.
ages_x = [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]
x_index = np.arange(len(ages_x))  # 0..len(ages_x)-1, one slot per age
width = 0.25  # bar width; also the horizontal offset between series

dev_y = [38496, 42000, 46752, 49320, 53200, 56000, 62316, 64928, 67317, 68748, 73752]
py_dev_y = [45372, 48476, 53850, 57287, 63016, 65998, 70003, 70000, 71496, 75370, 83640]
js_dev_y = [37810, 43515, 46823, 49293, 53437, 56373, 62375, 66674, 68745, 68746, 74583]

plt.style.use("ggplot")
plt.bar(x_index - width, py_dev_y, width=width, color="#5a7d9a", linestyle="-", label="Python_Developers")
plt.bar(x_index, js_dev_y, width=width, color="#adad3b", linestyle="-", label="Java_script_Developers")
plt.bar(x_index + width, dev_y, width=width, color="k", linestyle="--", label="Normal developers")

plt.title("Median Salaries by Age")
plt.xlabel("Age")
plt.ylabel("Median Salaries (USD)")
plt.legend()
plt.tight_layout()
# The x-axis now shows the positional indices rather than the ages;
# plt.xticks() can be used to relabel them.
# plt.xticks()
# Side-by-side bars with the x ticks relabelled from indices back to ages.
ages_x = [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]
x_index = np.arange(len(ages_x))  # creating an index value np array for each value of age
width = 0.25  # bar width; also the horizontal offset between series

dev_y = [38496, 42000, 46752, 49320, 53200, 56000, 62316, 64928, 67317, 68748, 73752]
py_dev_y = [45372, 48476, 53850, 57287, 63016, 65998, 70003, 70000, 71496, 75370, 83640]
js_dev_y = [37810, 43515, 46823, 49293, 53437, 56373, 62375, 66674, 68745, 68746, 74583]

plt.style.use("ggplot")
plt.bar(x_index - width, py_dev_y, width=width, color="#5a7d9a", linestyle="-", label="Python_Developers")
plt.bar(x_index, js_dev_y, width=width, color="#adad3b", linestyle="-", label="Java_script_Developers")
plt.bar(x_index + width, dev_y, width=width, color="k", linestyle="--", label="Normal developers")

plt.title("Median Salaries by Age")
# Put a tick at every index position and label it with the matching age.
plt.xticks(ticks=x_index, labels=ages_x)
plt.xlabel("Age")
plt.ylabel("Median Salaries (USD)")
plt.legend()
plt.tight_layout()
barh
# Load the responder/language CSV and preview its first rows.
data = pd.read_csv("data/data.csv")
data.head()
Responder_id | LanguagesWorkedWith | |
---|---|---|
0 | 1 | HTML/CSS;Java;JavaScript;Python |
1 | 2 | C++;HTML/CSS;Python |
2 | 3 | HTML/CSS |
3 | 4 | C;C++;C#;Python;SQL |
4 | 5 | C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA |
# Tally how many responders listed each language.
ids = data["Responder_id"]
lang_responses = data["LanguagesWorkedWith"]

from collections import Counter

lang_counter = Counter()
for response in lang_responses:
    # Each response is a single ';'-separated string of language names.
    lang_counter.update(response.split(";"))
lang_counter
Counter({'JavaScript': 59219,
'HTML/CSS': 55466,
'SQL': 47544,
'Python': 36443,
'Java': 35917,
'Bash/Shell/PowerShell': 31991,
'C#': 27097,
'PHP': 23030,
'C++': 20524,
'TypeScript': 18523,
'C': 18017,
'Other(s):': 7920,
'Ruby': 7331,
'Go': 7201,
'Assembly': 5833,
'Swift': 5744,
'Kotlin': 5620,
'R': 5048,
'VBA': 4781,
'Objective-C': 4191,
'Scala': 3309,
'Rust': 2794,
'Dart': 1683,
'Elixir': 1260,
'Clojure': 1254,
'WebAssembly': 1015,
'F#': 973,
'Erlang': 777})
# Split the 15 most common (language, count) pairs into two parallel lists.
# Both stay plain lists so they can be reversed in place later.
top_languages = lang_counter.most_common(15)
language = [name for name, _ in top_languages]
popularity = [count for _, count in top_languages]
print(language)
print(popularity)
['JavaScript', 'HTML/CSS', 'SQL', 'Python', 'Java', 'Bash/Shell/PowerShell', 'C#', 'PHP', 'C++', 'TypeScript', 'C', 'Other(s):', 'Ruby', 'Go', 'Assembly']
[59219, 55466, 47544, 36443, 35917, 31991, 27097, 23030, 20524, 18523, 18017, 7920, 7331, 7201, 5833]
# Vertical bar chart of language popularity.
plt.bar(language, popularity)
plt.title("Popularity of Programming languages")
plt.ylabel("Popularity")
plt.xlabel("Programming language")
Text(0.5, 0, 'Programming language')
# The x-axis labels in the chart above are not clearly readable, so use a
# horizontal bar chart instead.
plt.barh(language, popularity)
plt.title("Popularity of Programming languages")
plt.xlabel("Popularity")
plt.ylabel("Programming language")
Text(0, 0.5, 'Programming language')
# Reverse the order so the most popular language ends up at the top of the
# horizontal bar chart.
language.reverse()
popularity.reverse()
plt.barh(language, popularity)
plt.title("Popularity of Programming languages")
plt.xlabel("Popularity")
plt.ylabel("Programming language")
Text(0, 0.5, 'Programming language')
Scatter Plots
import pandas as pd
import matplotlib.pyplot as plt
# Basic scatter plot with default marker settings.
x = [5, 7, 8, 5, 6, 7, 9, 2, 3, 4, 4, 4, 2, 6, 3, 6, 8, 6, 4, 1]
y = [7, 4, 3, 9, 1, 3, 2, 5, 2, 4, 8, 7, 1, 6, 4, 9, 7, 7, 5, 1]

plt.scatter(x, y)
plt.tight_layout()
Customizing the scatter plots
# Change the size of the points.
x = [5, 7, 8, 5, 6, 7, 9, 2, 3, 4, 4, 4, 2, 6, 3, 6, 8, 6, 4, 1]
y = [7, 4, 3, 9, 1, 3, 2, 5, 2, 4, 8, 7, 1, 6, 4, 9, 7, 7, 5, 1]

plt.scatter(x, y, s=200)
plt.tight_layout()
# Change the color of the points.
x = [5, 7, 8, 5, 6, 7, 9, 2, 3, 4, 4, 4, 2, 6, 3, 6, 8, 6, 4, 1]
y = [7, 4, 3, 9, 1, 3, 2, 5, 2, 4, 8, 7, 1, 6, 4, 9, 7, 7, 5, 1]

plt.scatter(x, y, s=150, color="green")
plt.tight_layout()
# Change the marker used for the points.
x = [5, 7, 8, 5, 6, 7, 9, 2, 3, 4, 4, 4, 2, 6, 3, 6, 8, 6, 4, 1]
y = [7, 4, 3, 9, 1, 3, 2, 5, 2, 4, 8, 7, 1, 6, 4, 9, 7, 7, 5, 1]

plt.scatter(x, y, s=150, color="green", marker="X")
plt.tight_layout()
# Give the markers an edge and an alpha (transparency) value.
x = [5, 7, 8, 5, 6, 7, 9, 2, 3, 4, 4, 4, 2, 6, 3, 6, 8, 6, 4, 1]
y = [7, 4, 3, 9, 1, 3, 2, 5, 2, 4, 8, 7, 1, 6, 4, 9, 7, 7, 5, 1]

plt.scatter(x, y, s=150, c="green", edgecolor="black", linewidth=2, alpha=0.75)
plt.tight_layout()
Using different colors for markers.
# Color each point by a per-point value mapped through the "Reds" colormap.
x = [5, 7, 8, 5, 6, 7, 9, 2, 3, 4, 4, 4, 2, 6, 3, 6, 8, 6, 4, 1]
y = [7, 4, 3, 9, 1, 3, 2, 5, 2, 4, 8, 7, 1, 6, 4, 9, 7, 7, 5, 1]
colors = [7, 5, 3, 9, 5, 7, 2, 5, 3, 7, 1, 2, 8, 1, 9, 2, 5, 6, 7, 5]

plt.scatter(x, y, s=150, c=colors, cmap="Reds", edgecolor="black", linewidth=2, alpha=0.75)
plt.tight_layout()
# Setting a colorbar legend
x = [5, 7, 8, 5, 6, 7, 9, 2, 3, 4, 4, 4, 2, 6, 3, 6, 8, 6, 4, 1]
y = [7, 4, 3, 9, 1, 3, 2, 5, 2, 4, 8, 7, 1, 6, 4, 9, 7, 7, 5, 1]
colors = [7, 5, 3, 9, 5, 7, 2, 5, 3, 7, 1, 2, 8, 1, 9, 2, 5, 6, 7, 5]

plt.scatter(
    x, y, s=150, c=colors, cmap="Reds", edgecolor="black", linewidth=2, alpha=0.75
)
# The colorbar explains what the color of each point stands for.
cbar = plt.colorbar()
cbar.set_label("Satisfaction Ratings")
plt.tight_layout()
# Scatter plot where both the color and the size vary per point.
x = [5, 7, 8, 5, 6, 7, 9, 2, 3, 4, 4, 4, 2, 6, 3, 6, 8, 6, 4, 1]
y = [7, 4, 3, 9, 1, 3, 2, 5, 2, 4, 8, 7, 1, 6, 4, 9, 7, 7, 5, 1]
colors = [7, 5, 3, 9, 5, 7, 2, 5, 3, 7, 1, 2, 8, 1, 9, 2, 5, 6, 7, 5]
sizes = [209, 486, 381, 255, 717, 315, 175, 228, 174, 592, 293, 399, 255, 525, 154, 253, 475, 457, 214, 253]

plt.scatter(x, y, s=sizes, c=colors, cmap="Reds", edgecolor="black", linewidth=2, alpha=0.75)
cbar = plt.colorbar()
cbar.set_label("Satisfaction Ratings")
plt.tight_layout()
Scatter plot for a CSV file data
import pandas as pd
import matplotlib.pyplot as plt
# Load the YouTube CSV (view_count, likes, ratio) and preview its first rows.
data = pd.read_csv("data/yt.csv")
data.head()
view_count | likes | ratio | |
---|---|---|---|
0 | 8036001 | 324742 | 96.91 |
1 | 9378067 | 562589 | 98.19 |
2 | 2182066 | 273650 | 99.38 |
3 | 6525864 | 94698 | 96.25 |
4 | 9481284 | 582481 | 97.22 |
# Check whether the number of likes increases with the number of views.
views = data["view_count"]
likes = data["likes"]
ratio = data["ratio"]

plt.scatter(views, likes, edgecolor="black", linewidth=2, alpha=0.75)

plt.title("Trending Youtube videos")
plt.xlabel("Number of views")
plt.ylabel("Number of likes")

plt.tight_layout()
# Same views-vs-likes scatter plot, drawn with logarithmic x and y axes.
views = data["view_count"]
likes = data["likes"]
ratio = data["ratio"]

plt.scatter(views, likes, edgecolor="black", linewidth=2, alpha=0.75)

plt.title("Trending Youtube videos")
plt.xlabel("Number of views")
plt.ylabel("Number of likes")

plt.xscale("log")
plt.yscale("log")
plt.tight_layout()
# Use the like/dislike ratio as the color parameter to describe the data
# better.
views = data["view_count"]
likes = data["likes"]
ratio = data["ratio"]

plt.scatter(views, likes, c=ratio, cmap="summer", edgecolor="black", linewidth=2, alpha=0.75)

cbar = plt.colorbar()
cbar.set_label("Like to Dislike Ratio")

plt.title("Trending Youtube videos")
plt.xlabel("Number of views")
plt.ylabel("Number of likes")

plt.xscale("log")
plt.yscale("log")
plt.tight_layout()
Plotting Time Series Data
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mpl_dates
from datetime import datetime, timedelta
# Plot a week of values against datetime objects on the x-axis.
date = [
    datetime(2019, 5, 24),
    datetime(2019, 5, 25),
    datetime(2019, 5, 26),
    datetime(2019, 5, 27),
    datetime(2019, 5, 28),
    datetime(2019, 5, 29),
    datetime(2019, 5, 30),
]
y = [0, 1, 3, 4, 6, 5, 7]

plt.plot(date, y)
# Same date plot with the line style set explicitly.
date = [
    datetime(2019, 5, 24),
    datetime(2019, 5, 25),
    datetime(2019, 5, 26),
    datetime(2019, 5, 27),
    datetime(2019, 5, 28),
    datetime(2019, 5, 29),
    datetime(2019, 5, 30),
]
y = [0, 1, 3, 4, 6, 5, 7]

plt.plot(date, y, linestyle="solid")
# Date plot with the x-axis date labels auto-formatted by the figure.
date = [
    datetime(2019, 5, 24),
    datetime(2019, 5, 25),
    datetime(2019, 5, 26),
    datetime(2019, 5, 27),
    datetime(2019, 5, 28),
    datetime(2019, 5, 29),
    datetime(2019, 5, 30),
]
y = [0, 1, 3, 4, 6, 5, 7]

plt.plot(date, y, linestyle="solid")
# gcf (get current figure) autofmt(auto format)
plt.gcf().autofmt_xdate()
plt.tight_layout()
Changing the format of dates
# Date plot with a custom strftime-style format for the x-axis labels.
date = [
    datetime(2019, 5, 24),
    datetime(2019, 5, 25),
    datetime(2019, 5, 26),
    datetime(2019, 5, 27),
    datetime(2019, 5, 28),
    datetime(2019, 5, 29),
    datetime(2019, 5, 30),
]
y = [0, 1, 3, 4, 6, 5, 7]

plt.plot(date, y, linestyle="solid")

plt.gcf().autofmt_xdate()
date_format = mpl_dates.DateFormatter("%d, %b,%Y")
# gca = get current axis
plt.gca().xaxis.set_major_formatter(date_format)
plt.tight_layout()
Using the datetime Plot on CSV file
# Load the daily price CSV and preview its first rows.
data = pd.read_csv("data/datetime.csv")
data.head()
Date | Open | High | Low | Close | Adj Close | Volume | |
---|---|---|---|---|---|---|---|
0 | 2019-05-18 | 7266.080078 | 8281.660156 | 7257.259766 | 8193.139648 | 8193.139648 | 723011166 |
1 | 2019-05-19 | 8193.139648 | 8193.139648 | 7591.850098 | 7998.290039 | 7998.290039 | 637617163 |
2 | 2019-05-20 | 7998.290039 | 8102.319824 | 7807.770020 | 7947.930176 | 7947.930176 | 357803946 |
3 | 2019-05-21 | 7947.930176 | 8033.759766 | 7533.660156 | 7626.890137 | 7626.890137 | 424501866 |
4 | 2019-05-22 | 7626.890137 | 7971.259766 | 7478.740234 | 7876.500000 | 7876.500000 | 386766321 |
# Plot the closing price against the "Date" column exactly as read from the
# CSV (no datetime conversion is applied in this cell).
price_date = data["Date"]
price_close = data["Close"]

plt.plot(price_date, price_close, linestyle="solid")

plt.gcf().autofmt_xdate()
plt.title("Bitcoin Prices")
plt.xlabel("Dates")
plt.ylabel("Price in (USD)")
Text(0, 0.5, 'Price in (USD)')
# We are making use of pandas to_datetime method
data["Date"] = pd.to_datetime(data["Date"])
# Sort in place by date so the line is drawn in chronological order.
data.sort_values("Date", inplace=True)

price_date = data["Date"]
price_close = data["Close"]

plt.plot(price_date, price_close, linestyle="solid")

plt.gcf().autofmt_xdate()
plt.title("Bitcoin Prices")
plt.xlabel("Dates")
plt.ylabel("Price in (USD)")
Text(0, 0.5, 'Price in (USD)')